From 7534dd6b255d4d1715efc792fde0036a30310532 Mon Sep 17 00:00:00 2001 From: momo609 <963372609@qq.com> Date: Mon, 13 May 2024 10:11:05 +0800 Subject: [PATCH 01/33] fix npu bug. --- .../csrc/pytorch/npu/chamfer_distance_npu.cpp | 19 +++- mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp | 105 ++++++++++++++---- .../csrc/pytorch/npu/gather_points_npu.cpp | 16 ++- mmcv/ops/csrc/pytorch/npu/knn_npu.cpp | 4 +- mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp | 14 ++- .../csrc/pytorch/npu/stack_ball_query_npu.cpp | 5 +- .../pytorch/npu/three_interpolate_npu.cpp | 29 +++-- mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp | 14 +-- mmcv/ops/deform_conv.py | 2 +- mmcv/ops/fused_bias_leakyrelu.py | 2 +- mmcv/ops/knn.py | 18 +++ mmcv/ops/modulated_deform_conv.py | 6 +- mmcv/ops/nms.py | 4 +- mmcv/ops/points_in_boxes.py | 7 +- mmcv/ops/points_in_polygons.py | 2 + mmcv/ops/scatter_points.py | 36 +++++- mmcv/ops/three_nn.py | 15 +++ 17 files changed, 233 insertions(+), 65 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp index 8b30fa15df..4f5c32dbec 100644 --- a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp @@ -5,19 +5,34 @@ using namespace std; void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1, Tensor dist2, Tensor idx1, Tensor idx2) { + bool is_half = XYZ1.scalar_type() == at::kHalf; at::Tensor xyz1 = at::ones_like(XYZ1); at::Tensor xyz2 = at::ones_like(XYZ2); + at::Tensor distf1 = at::ones_like(dist1); + at::Tensor distf2 = at::ones_like(dist2); xyz1 = XYZ1.transpose(1, 2).transpose(0, 1); xyz2 = XYZ2.transpose(1, 2).transpose(0, 1); + if (is_half) { + xyz1 = xyz1.to(at::kFloat); + xyz2 = xyz2.to(at::kFloat); + distf1 = dist1.to(at::kFloat); + distf2 = dist2.to(at::kFloat); + } OpCommand cmd; cmd.Name("ChamferDistance") .Input(xyz1) .Input(xyz2) - .Output(dist1) - .Output(dist2) + .Output(distf1) + .Output(distf2) .Output(idx1) 
.Output(idx2) .Run(); + if (is_half) { + distf1 = distf1.to(at::kHalf); + distf2 = distf2.to(at::kHalf); + } + dist1.copy_(distf1); + dist2.copy_(distf2); } void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1, diff --git a/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp b/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp index 5030fed0e7..3f3bc5a047 100644 --- a/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp @@ -4,6 +4,21 @@ using namespace std; void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight, Tensor output, float gamma, float alpha) { + at::Tensor input_y = input; + at::Tensor output_y = output; + bool is_half = input.scalar_type() == at::kHalf; + if (is_half) { + input_y = input.to(at::kFloat); + output_y = output.to(at::kFloat); + } + int64_t weight_size = weight.size(0); + at::Tensor weight_y = at::ones_like(input_y); + if (weight_size > 0) { + weight_y = at::broadcast_to(weight, input.sizes()); + if (is_half) { + weight_y = weight_y.to(at::kFloat); + } + } int64_t n_class = input.size(1); at::Tensor target_y = at::ones_like(input); if (n_class == 1) { @@ -12,24 +27,26 @@ void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight, target_y = at::add(target_y, 1.0); } else { target_y = at::one_hot(target, n_class); + weight_y = at::mul(weight_y, target_y); + weight_y = at::sum(weight_y, 1, true); + weight_y = at::broadcast_to(weight_y, input.sizes()); } target_y = target_y.to(at::kInt); - int64_t weight_size = weight.size(0); - at::Tensor weight_y = at::ones_like(input); - if (weight_size > 0) { - weight_y = at::broadcast_to(weight, input.sizes()); - } OpCommand cmd; string reduction = "none"; cmd.Name("SigmoidFocalLoss") - .Input(input) + .Input(input_y) .Input(target_y) .Input(weight_y) - .Output(output) + .Output(output_y) .Attr("gamma", gamma) .Attr("alpha", alpha) .Attr("reduction", reduction) .Run(); + if (is_half) { + output_y = 
output_y.to(at::kHalf); + } + output.copy_(output_y); } void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, @@ -38,34 +55,51 @@ void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight, Tensor grad_input, float gamma, float alpha) { + at::Tensor input_y = input; + at::Tensor grad_input_y = grad_input; + bool is_half = input.scalar_type() == at::kHalf; + if (is_half) { + input_y = input.to(at::kFloat); + grad_input_y = grad_input.to(at::kFloat); + } + int64_t weight_size = weight.size(0); + at::Tensor weight_y = at::ones_like(input_y); + if (weight_size > 0) { + weight_y = at::broadcast_to(weight, input.sizes()); + if (is_half) { + weight_y = weight_y.to(at::kFloat); + } + } int64_t n_class = input.size(1); at::Tensor target_y = at::ones_like(input); if (n_class == 1) { target_y = at::reshape(target, input.sizes()); } else { target_y = at::one_hot(target, n_class); + weight_y = at::mul(weight_y, target_y); + weight_y = at::sum(weight_y, 1, true); + weight_y = at::broadcast_to(weight_y, input.sizes()); target_y = at::mul(target_y, -1.0); target_y = at::add(target_y, 1.0); } target_y = target_y.to(at::kInt); at::Tensor grad_up = at::ones_like(input); - int64_t weight_size = weight.size(0); - at::Tensor weight_y = at::ones_like(input); - if (weight_size > 0) { - weight_y = at::broadcast_to(weight, input.sizes()); - } OpCommand cmd; string reduction = "none"; cmd.Name("SigmoidFocalLossGrad") - .Input(input) + .Input(input_y) .Input(target_y) .Input(grad_up) .Input(weight_y) - .Output(grad_input) + .Output(grad_input_y) .Attr("gamma", gamma) .Attr("alpha", alpha) .Attr("reduction", reduction) .Run(); + if (is_half) { + grad_input_y = grad_input_y.to(at::kHalf); + } + grad_input.copy_(grad_input_y); } void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target, @@ -74,19 +108,30 @@ void sigmoid_focal_loss_backward_impl(Tensor input, 
Tensor target, void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight, Tensor output, float gamma, float alpha) { + at::Tensor input_y = input; + bool is_half = input.scalar_type() == at::kHalf; + if (is_half) { + input_y = input.to(at::kFloat); + } int64_t n_class = input.size(1); at::Tensor target_y = at::one_hot(target, n_class); target_y = target_y.to(at::kInt); int64_t weight_size = weight.size(0); - at::Tensor weight_y = at::ones_like(input); + at::Tensor weight_y = at::ones_like(input_y); if (weight_size > 0) { weight_y = at::broadcast_to(weight, input.sizes()); + if (is_half) { + weight_y = weight_y.to(at::kFloat); + } + weight_y = at::mul(weight_y, target_y); + weight_y = at::sum(weight_y, 1, true); + weight_y = at::broadcast_to(weight_y, input.sizes()); } - at::Tensor op_output = at::ones_like(input); + at::Tensor op_output = at::ones_like(input_y); OpCommand cmd; string reduction = "none"; cmd.Name("SoftmaxFocalLoss") - .Input(input) + .Input(input_y) .Input(target_y) .Input(weight_y) .Output(op_output) @@ -94,6 +139,9 @@ void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight, .Attr("alpha", alpha) .Attr("reduction", reduction) .Run(); + if (is_half) { + op_output = op_output.to(at::kHalf); + } int64_t n_batch = input.size(0); c10::SmallVector offsets = {0, 0}; c10::SmallVector sizes = {n_batch, 1}; @@ -124,27 +172,44 @@ void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight, Tensor buff, Tensor grad_input, float gamma, float alpha) { + at::Tensor input_y = input; + at::Tensor grad_input_y = grad_input; + bool is_half = input.scalar_type() == at::kHalf; + if (is_half) { + input_y = input.to(at::kFloat); + grad_input_y = grad_input.to(at::kFloat); + } int64_t n_class = input.size(1); at::Tensor target_y = at::one_hot(target, n_class); target_y = target_y.to(at::kInt); at::Tensor grad_up = 
at::ones_like(input); int64_t weight_size = weight.size(0); - at::Tensor weight_y = at::ones_like(input); + at::Tensor weight_y = at::ones_like(input_y); if (weight_size > 0) { weight_y = at::broadcast_to(weight, input.sizes()); + if (is_half) { + weight_y = weight_y.to(at::kFloat); + } + weight_y = at::mul(weight_y, target_y); + weight_y = at::sum(weight_y, 1, true); + weight_y = at::broadcast_to(weight_y, input.sizes()); } OpCommand cmd; string reduction = "none"; cmd.Name("SoftmaxFocalLossGrad") - .Input(input) + .Input(input_y) .Input(target_y) .Input(grad_up) .Input(weight_y) - .Output(grad_input) + .Output(grad_input_y) .Attr("gamma", gamma) .Attr("alpha", alpha) .Attr("reduction", reduction) .Run(); + if (is_half) { + grad_input_y = grad_input_y.to(at::kHalf); + } + grad_input.copy_(grad_input_y); } void softmax_focal_loss_backward_impl(Tensor input, Tensor target, diff --git a/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp b/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp index 747380fb09..279f14008b 100644 --- a/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp @@ -24,6 +24,12 @@ void gather_points_forward_npu(int b, int c, int n, int npoints, void gather_points_backward_npu(int b, int c, int n, int npoints, const Tensor grad_out, const Tensor idx, Tensor grad_points) { + at::Tensor grad_out_cast = grad_out; + at::Tensor grad_points_cast = grad_points; + if (grad_out.scalar_type() == at::ScalarType::Half) { + grad_out_cast = grad_out.to(at::kFloat); + grad_points_cast = grad_points.to(at::kFloat); + } at::Tensor indices = idx; if (idx.scalar_type() != at::ScalarType::Int) { indices = idx.to(at::kInt); @@ -37,11 +43,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints, for (uint64_t i = 0; i < shape.size(); i++) { pad_size.emplace_back(shape[i]); } - at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous(); + at::Tensor trans_grad_points = grad_points_cast.transpose(1, 
2).contiguous(); at::Tensor grad_points_view = trans_grad_points.view( {trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1], trans_grad_points.sizes()[2]}); - at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous(); + at::Tensor trans_grad_out = grad_out_cast.transpose(1, 2).contiguous(); trans_grad_out = trans_grad_out.view( {trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1], trans_grad_out.sizes()[2]}); @@ -63,7 +69,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints, at::Tensor grad_points_result = grad_points_view.view(trans_grad_points.sizes()); grad_points_result = grad_points_result.transpose(1, 2); - grad_points.copy_(grad_points_result); + at::Tensor grad_points_result_cast = grad_points_result; + if (grad_out.scalar_type() == at::ScalarType::Half) { + grad_points_result_cast = grad_points_result.to(at::kHalf); + } + grad_points.copy_(grad_points_result_cast); } void gather_points_forward_impl(int b, int c, int n, int npoints, diff --git a/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp index f25f9cf623..c4a1bcbd25 100644 --- a/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp @@ -8,11 +8,11 @@ using namespace std; void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz, const Tensor new_xyz, Tensor idx, Tensor dist2) { // transpose known from [B, N, 3] to [B, 3, N] - at::Tensor source = xyz.transpose(1, 2).contiguous(); + at::Tensor source = xyz.transpose(2, 1).contiguous(); at::Tensor target = new_xyz.contiguous(); bool is_from_knn = true; - EXEC_NPU_CMD(aclnnKnn, source, target, nsample, is_from_knn, idx, dist2); + EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2); } void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz, diff --git a/mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp b/mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp index c7a11e8c6d..b7015439b9 100644 --- a/mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp +++ 
b/mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp @@ -50,23 +50,29 @@ void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax, int64_t pooled_height_64 = pooled_height; int64_t pooled_width_64 = pooled_width; int64_t pooled_channel = 1; + at::Tensor argmax_trans = argmax.transpose(1, 2).transpose(2, 3); + at::Tensor grad_output_trans = grad_output.transpose(1, 2).transpose(2, 3); at::Tensor roi_actual_num = at::empty_like(rois, rois.options().dtype(at::kInt)); - at::Tensor x = at::ones_like(grad_input); + at::Tensor x = at::ones_like(grad_input).transpose(1, 2).transpose(2, 3); + at::Tensor y = at::zeros_like(x); OpCommand cmd; cmd.Name("RoiPoolingGradWithArgMax") - .Input(grad_output) + .Input(grad_output_trans) .Input(x) .Input(rois) .Input(roi_actual_num) - .Input(argmax) - .Output(grad_input) + .Input(argmax_trans) + .Output(y) .Attr("pooled_h", pooled_height_64) .Attr("pooled_w", pooled_width_64) .Attr("spatial_scale_h", spatial_scale) .Attr("spatial_scale_w", spatial_scale) .Attr("pool_channel", pooled_channel) .Run(); + at::Tensor result = y.transpose(2, 3).transpose(1, 2); + at::Tensor res = result.contiguous(); + grad_input.copy_(res); } void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output, diff --git a/mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp b/mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp index cd8c3ad8c9..92627df6e3 100644 --- a/mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp @@ -8,9 +8,10 @@ void stack_ball_query_forward_npu(float max_radius, int nsample, const Tensor new_xyz_batch_cnt, const Tensor xyz, const Tensor xyz_batch_cnt, Tensor idx) { - at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous(); + at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous().to(at::kFloat); + at::Tensor new_xyz_fp32 = new_xyz.to(at::kFloat); double max_radius_double = double(max_radius); - EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, 
xyz_batch_cnt, + EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz_fp32, xyz_batch_cnt, new_xyz_batch_cnt, max_radius_double, nsample, idx); } diff --git a/mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp b/mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp index f908755478..42d346f7d2 100644 --- a/mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp @@ -12,17 +12,21 @@ void three_interpolate_forward_npu(int b, int c, int m, int n, TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf), "three_interpolate_forward ascend only support fp32 and fp16."); - auto point_c_trans = points.transpose(1, 2); - + auto point_c_trans = points.transpose(1, 2).to(at::kFloat); + auto weight_cast = weight.to(at::kFloat); + auto out_cast = out.to(at::kFloat); OpCommand cmd; cmd.Name("ThreeInterpolate") .Input(point_c_trans) .Input(idx) - .Input(weight) - .Output(out) + .Input(weight_cast) + .Output(out_cast) .Run(); - auto output = out.view({b, n, c}).transpose(1, 2); + if (originDtype == at::kHalf) { + out_cast = out_cast.to(at::kHalf); + } + auto output = out_cast.view({b, n, c}).transpose(1, 2); auto res = output.contiguous(); out.copy_(res); } @@ -34,12 +38,17 @@ void three_interpolate_backward_npu(int b, int c, int n, int m, TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf), "three_interpolate_backward ascend only support fp32 and fp16."); - auto grad_x = at::unsqueeze(grad_out, 3); - auto grad_y = at::unsqueeze(grad_points, 3); - - EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight, m, grad_y); + auto grad_x = at::unsqueeze(grad_out, 3).to(at::kFloat); + auto grad_y = at::unsqueeze(grad_points, 3).to(at::kFloat); + auto weight_cast = weight.to(at::kFloat); + EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight_cast, m, + grad_y); - auto output = at::squeeze(grad_y, 3); + auto grad_y_cast = grad_y; + if (originDtype == at::kHalf) { + grad_y_cast = 
grad_y.to(at::kHalf); + } + auto output = at::squeeze(grad_y_cast, 3); auto res = output.contiguous(); grad_points.copy_(res); } diff --git a/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp index 9766816f6c..6740a731bc 100644 --- a/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp @@ -7,21 +7,11 @@ using namespace std; void three_nn_forward_npu(int b, int n, int m, const Tensor unknown, const Tensor known, Tensor dist2, Tensor idx) { - // transpose known [B, N, 3] -> [B, 3, N] - at::Tensor source = known.transpose(1, 2).contiguous(); + at::Tensor source = known.contiguous(); at::Tensor target = unknown.contiguous(); - auto originDtype = source.scalar_type(); - if (originDtype == at::kHalf) { - source = source.to(at::kFloat); - target = target.to(at::kFloat); - } bool is_from_knn = false; - uint32_t nsample = 3; - EXEC_NPU_CMD(aclnnKnn, source, target, nsample, is_from_knn, idx, dist2); - if (originDtype == at::kHalf) { - dist2 = dist2.to(at::kHalf); - } + EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2); } void three_nn_forward_impl(int b, int n, int m, const Tensor unknown, diff --git a/mmcv/ops/deform_conv.py b/mmcv/ops/deform_conv.py index c6cbba6779..856e3e36b6 100644 --- a/mmcv/ops/deform_conv.py +++ b/mmcv/ops/deform_conv.py @@ -57,7 +57,7 @@ def _npu_backward(ctx, grad_output): grad_input, grad_weight, grad_offset_all, grad_bias = \ torch_npu.npu_deformable_conv2dbk( input_tensor, grad_output, offset_out, weight, offset_all, - kernel_size=[weight.shape[3], weight.shape[2]], + kernel_size=[weight.shape[2], weight.shape[3]], stride=[1, 1, ctx.stride[0], ctx.stride[1]], padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1], ctx.padding[1]], diff --git a/mmcv/ops/fused_bias_leakyrelu.py b/mmcv/ops/fused_bias_leakyrelu.py index e23617fb3a..fe17d2db7b 100644 --- a/mmcv/ops/fused_bias_leakyrelu.py +++ b/mmcv/ops/fused_bias_leakyrelu.py @@ -258,7 +258,7 @@ def 
fused_bias_leakyrelu(input: torch.Tensor, torch.Tensor: Feature map after non-linear activation. """ - if not input.is_cuda: + if not input.is_cuda and input.device.type != 'npu': return bias_leakyrelu_ref(input, bias, negative_slope, scale) return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype), diff --git a/mmcv/ops/knn.py b/mmcv/ops/knn.py index 47ced04c6a..1e2a68d1d2 100644 --- a/mmcv/ops/knn.py +++ b/mmcv/ops/knn.py @@ -62,6 +62,24 @@ def forward(ctx, B, npoint, _ = center_xyz.shape N = xyz.shape[1] + if xyz.device.type == 'npu': + dist = center_xyz.new_zeros((B, npoint, N)).float() + ext_module.knn_forward( + xyz, + center_xyz, + torch.Tensor([]).npu(), + dist, + b=B, + n=N, + m=npoint, + nsample=k) + dist2, idx = torch.topk(dist, k, dim=2, largest=False, sorted=True) + zeros_idx = torch.zeros( + xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu() + idx.where(dist2 >= 1e10, zeros_idx) + idx = idx.transpose(2, 1).contiguous() # [B, k, npoint] + return idx.int() + idx = center_xyz.new_zeros((B, npoint, k)).int() dist2 = center_xyz.new_zeros((B, npoint, k)).float() diff --git a/mmcv/ops/modulated_deform_conv.py b/mmcv/ops/modulated_deform_conv.py index 0c169009a5..b6e8c6d40a 100644 --- a/mmcv/ops/modulated_deform_conv.py +++ b/mmcv/ops/modulated_deform_conv.py @@ -55,7 +55,7 @@ def _npu_forward(ctx, input_tensor, offset, mask, weight, bias): conv2d_bias = bias if len(bias) > 0 else None sort_index_fp, sort_index_bp = \ ModulatedDeformConv2dFunction._calculate_sort_index( - kernel_w, kernel_h, ctx.deform_groups) + kernel_h, kernel_w, ctx.deform_groups) select_offset = offset.index_select(1, sort_index_fp) offset_all = torch.cat([select_offset, mask], dim=1) import torch_npu @@ -64,7 +64,7 @@ def _npu_forward(ctx, input_tensor, offset, mask, weight, bias): weight, offset_all, conv2d_bias, - kernel_size=[kernel_w, kernel_h], + kernel_size=[kernel_h, kernel_w], stride=[1, 1, ctx.stride[0], ctx.stride[1]], padding=[ ctx.padding[0], 
ctx.padding[0], ctx.padding[1], ctx.padding[1] @@ -87,7 +87,7 @@ def _npu_backward(ctx, grad_output): grad_input, grad_weight, grad_offset_all, grad_bias = \ torch_npu.npu_deformable_conv2dbk( input_tensor, grad_output, offset_out, weight, offset_all, - kernel_size=[weight.shape[3], weight.shape[2]], + kernel_size=[weight.shape[2], weight.shape[3]], stride=[1, 1, ctx.stride[0], ctx.stride[1]], padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1], ctx.padding[1]], diff --git a/mmcv/ops/nms.py b/mmcv/ops/nms.py index fb08ba07c6..0c6adfabc7 100644 --- a/mmcv/ops/nms.py +++ b/mmcv/ops/nms.py @@ -405,7 +405,7 @@ def nms_rotated(dets: Tensor, flip_mat[-1] = -1 dets_cw = dets * flip_mat else: - dets_cw = dets + dets_cw = dets.clone() multi_label = labels is not None if labels is None: input_labels = scores.new_empty(0, dtype=torch.int) @@ -415,6 +415,8 @@ def nms_rotated(dets: Tensor, order = scores.new_empty(0, dtype=torch.long) if dets.device.type == 'npu': coefficient = 57.29578 # 180 / PI + dets_cw = dets_cw.float() + scores = scores.float() for i in range(dets.size()[0]): dets_cw[i][4] *= coefficient # radians to angle keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw, diff --git a/mmcv/ops/points_in_boxes.py b/mmcv/ops/points_in_boxes.py index 4915e6b573..23c35da4eb 100644 --- a/mmcv/ops/points_in_boxes.py +++ b/mmcv/ops/points_in_boxes.py @@ -47,8 +47,11 @@ def points_in_boxes_part(points: Tensor, boxes: Tensor) -> Tensor: points_device = points.get_device() assert points_device == boxes.get_device(), \ 'Points and boxes should be put on the same device' - if torch.cuda.current_device() != points_device: - torch.cuda.set_device(points_device) + if points.device.type != 'npu': + if torch.cuda.current_device() != points_device: + torch.cuda.set_device(points_device) + elif points.device.type == 'npu': + boxes[:, :, 2] += boxes[:, :, 5] / 2.0 ext_module.points_in_boxes_part_forward(boxes.contiguous(), points.contiguous(), diff --git 
a/mmcv/ops/points_in_polygons.py b/mmcv/ops/points_in_polygons.py index e54b5a896d..8d3bc8dd48 100644 --- a/mmcv/ops/points_in_polygons.py +++ b/mmcv/ops/points_in_polygons.py @@ -19,6 +19,8 @@ def points_in_polygons(points: Tensor, polygons: Tensor) -> Tensor: polygons (torch.Tensor): It has shape (M, 8), indicating (x1, y1, x2, y2, x3, y3, x4, y4). M means the number of ground truth polygons. + constraints: The number of significant digits for the input-arguments + are between -10 and 10 when running on Ascend device. Returns: torch.Tensor: Return the result with the shape of (B, M), diff --git a/mmcv/ops/scatter_points.py b/mmcv/ops/scatter_points.py index 5d881bfe63..6902343a03 100644 --- a/mmcv/ops/scatter_points.py +++ b/mmcv/ops/scatter_points.py @@ -36,10 +36,29 @@ def forward(ctx: Any, reduced from input features that share the same voxel coordinates. The second is voxel coordinates with shape [M, ndim]. """ + ctx.device = feats.device.type + if ctx.device == 'npu': + import ads_c + voxel_idx = ads_c.point_to_voxel(coors, None, None) + unique_res = ads_c.unique_voxel(voxel_idx) + num_voxels, uniqued_voxel_idx, prefix_sum, \ + argsort_coor = unique_res + voxel_coors = ads_c.voxel_to_point(uniqued_voxel_idx, None, None) + voxel_feats, \ + compare_mask = ads_c.npu_dynamic_scatter(feats, coors, + prefix_sum, + argsort_coor, + num_voxels, + reduce_type) + ctx.reduce_type = reduce_type + ctx.feats_shape = feats.shape + ctx.save_for_backward(prefix_sum, argsort_coor, compare_mask) + ctx.mark_non_differentiable(voxel_coors) + return voxel_feats, voxel_coors + results = ext_module.dynamic_point_to_voxel_forward( feats, coors, reduce_type) - (voxel_feats, voxel_coors, point2voxel_map, - voxel_points_count) = results + voxel_feats, voxel_coors, point2voxel_map, voxel_points_count = results ctx.reduce_type = reduce_type ctx.save_for_backward(feats, voxel_feats, point2voxel_map, voxel_points_count) @@ -50,6 +69,19 @@ def forward(ctx: Any, def backward(ctx: Any, 
grad_voxel_feats: torch.Tensor, grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: + if ctx.device == 'npu': + import ads_c + prefix_sum, argsort_coor, compare_mask = ctx.saved_tensors + grad_point_feats = torch.zeros( + ctx.feats_shape, + dtype=grad_voxel_feats.dtype, + device=grad_voxel_feats.device) + ads_c.npu_dynamic_scatter_grad(grad_point_feats, + grad_voxel_feats.contiguous(), + prefix_sum, argsort_coor, + compare_mask, ctx.reduce_type) + return grad_point_feats, None, None + (feats, voxel_feats, point2voxel_map, voxel_points_count) = ctx.saved_tensors grad_feats = torch.zeros_like(feats) diff --git a/mmcv/ops/three_nn.py b/mmcv/ops/three_nn.py index d41b9789cf..52d504609a 100644 --- a/mmcv/ops/three_nn.py +++ b/mmcv/ops/three_nn.py @@ -34,6 +34,21 @@ def forward(ctx: Any, target: torch.Tensor, B, N, _ = target.size() m = source.size(1) + if source.device.type == 'npu': + # strict to fp32 + source = source.transpose(2, 1).contiguous() + dtype_ = source.dtype + if dtype_ == torch.float16: + target = target.float() + source = source.float() + dist = target.new_empty(B, N, m) + ext_module.three_nn_forward( + target, source, dist, torch.Tensor([]).npu(), b=B, n=N, m=m) + dist2, idx = torch.topk(dist, 3, dim=2, largest=False, sorted=True) + dist2 = torch.sqrt(dist2) + if dtype_ == torch.float16: + dist2 = dist2.half() + return dist2, idx.int() dist2 = target.new_empty(B, N, 3) idx = target.new_empty(B, N, 3, dtype=torch.int32) From ef10dc7c0f8fb58fff22965fd23ecac3cadf6bda Mon Sep 17 00:00:00 2001 From: ZYF-Annarine Date: Wed, 19 Jun 2024 14:47:57 +0800 Subject: [PATCH 02/33] modify chamfer --- .../csrc/pytorch/npu/chamfer_distance_npu.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp index 4f5c32dbec..170a5fa72a 100644 --- a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp +++ 
b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp @@ -6,17 +6,17 @@ using namespace std; void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1, Tensor dist2, Tensor idx1, Tensor idx2) { bool is_half = XYZ1.scalar_type() == at::kHalf; - at::Tensor xyz1 = at::ones_like(XYZ1); - at::Tensor xyz2 = at::ones_like(XYZ2); - at::Tensor distf1 = at::ones_like(dist1); - at::Tensor distf2 = at::ones_like(dist2); + at::Tensor xyz1 = XYZ1; + at::Tensor xyz2 = XYZ2; + at::Tensor distf1 = dist1; + at::Tensor distf2 = dist2; xyz1 = XYZ1.transpose(1, 2).transpose(0, 1); xyz2 = XYZ2.transpose(1, 2).transpose(0, 1); if (is_half) { xyz1 = xyz1.to(at::kFloat); xyz2 = xyz2.to(at::kFloat); - distf1 = dist1.to(at::kFloat); - distf2 = dist2.to(at::kFloat); + distf1 = distf1.to(at::kFloat); + distf2 = distf2.to(at::kFloat); } OpCommand cmd; cmd.Name("ChamferDistance") @@ -31,8 +31,8 @@ void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1, distf1 = distf1.to(at::kHalf); distf2 = distf2.to(at::kHalf); } - dist1.copy_(distf1); - dist2.copy_(distf2); + dist1 = distf1; + dist2 = distf2; } void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1, From 46182c61aabad8f572f1ea823079ab8c908b98e7 Mon Sep 17 00:00:00 2001 From: Pr0Wh1teGivee Date: Tue, 18 Jun 2024 17:26:07 +0800 Subject: [PATCH 03/33] Add NPU support for dynamic voxelization Update voxelization_npu.cpp Update test_voxelization.py Update voxelization_npu.cpp Update voxelization_npu.cpp Update voxelization_npu.cpp --- .../ops/csrc/pytorch/npu/voxelization_npu.cpp | 35 +++++++++++++++++++ tests/test_ops/test_voxelization.py | 15 ++++++++ 2 files changed, 50 insertions(+) diff --git a/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp b/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp index ffd9b4c43b..2abe7c8f95 100644 --- a/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/voxelization_npu.cpp @@ -11,6 +11,11 @@ int hard_voxelize_forward_impl(const at::Tensor 
&points, at::Tensor &voxels, const int max_points, const int max_voxels, const int NDim = 3); +void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim = 3); + int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors, at::Tensor &num_points_per_voxel, @@ -53,4 +58,34 @@ int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels, return voxel_num_int; } +void dynamic_voxelize_forward_npu(const at::Tensor &points, at::Tensor &coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim = 3) { + uint32_t ptsNum = points.size(0); + uint32_t ptsFeature = points.size(1); + at::Tensor ptsTrans = at::transpose(points, 0, 1); + double coors_min_x = coors_range[0]; + double coors_min_y = coors_range[1]; + double coors_min_z = coors_range[2]; + double coors_max_x = coors_range[3]; + double coors_max_y = coors_range[4]; + double coors_max_z = coors_range[5]; + double voxel_x = voxel_size[0]; + double voxel_y = voxel_size[1]; + double voxel_z = voxel_size[2]; + int grid_x = std::round((coors_max_x - coors_min_x) / voxel_x); + int grid_y = std::round((coors_max_y - coors_min_y) / voxel_y); + int grid_z = std::round((coors_max_z - coors_min_z) / voxel_z); + + at::Tensor tmp_coors = + at::zeros({3, ptsNum}, points.options().dtype(at::kInt)); + EXEC_NPU_CMD(aclnnDynamicVoxelization, ptsTrans, coors_min_x, coors_min_y, + coors_min_z, voxel_x, voxel_y, voxel_z, grid_x, grid_y, grid_z, + tmp_coors); + tmp_coors.transpose_(0, 1); + coors.copy_(tmp_coors); +} + REGISTER_NPU_IMPL(hard_voxelize_forward_impl, hard_voxelize_forward_npu); +REGISTER_NPU_IMPL(dynamic_voxelize_forward_impl, dynamic_voxelize_forward_npu); diff --git a/tests/test_ops/test_voxelization.py b/tests/test_ops/test_voxelization.py index 78282a8ad0..c31224913e 100644 --- a/tests/test_ops/test_voxelization.py +++ 
b/tests/test_ops/test_voxelization.py @@ -200,6 +200,9 @@ def test_voxelization_npu(device_type): points = voxel_dict['points'] points = torch.tensor(points) + max_num_points = -1 + dynamic_voxelization = Voxelization(voxel_size, point_cloud_range, + max_num_points) max_num_points = 1000 hard_voxelization = Voxelization(voxel_size, point_cloud_range, max_num_points) @@ -215,3 +218,15 @@ def test_voxelization_npu(device_type): assert np.all(coors == expected_coors) assert np.all(voxels == expected_voxels) assert np.all(num_points_per_voxel == expected_num_points_per_voxel) + + # test dynamic_voxelization on npu + coors = dynamic_voxelization.forward(points) + coors = coors.cpu().detach().numpy() + points = points.cpu().detach().numpy() + for i in range(expected_voxels.shape[0]): + indices = _get_voxel_points_indices(points, coors, expected_voxels[i]) + num_points_current_voxel = points[indices].shape[0] + assert num_points_current_voxel > 0 + assert np.all( + points[indices] == expected_coors[i][:num_points_current_voxel]) + assert num_points_current_voxel == expected_num_points_per_voxel[i] From b238850bfdc7bc00ebdbf3f786d4f7dedef420ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E8=B1=AA?= Date: Wed, 19 Jun 2024 16:55:12 +0800 Subject: [PATCH 04/33] Bugfix of NPU adapter of nms3d --- mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp | 17 +++++++++-------- mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp | 18 +++++++++++------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp b/mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp index 5d812fe047..6d2588a01d 100644 --- a/mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/nms3d_normal_npu.cpp @@ -3,23 +3,24 @@ using namespace NPU_NAME_SPACE; void iou3d_nms3d_normal_forward_npu(const Tensor boxes, Tensor &keep, - Tensor &keep_num, - float nms_overlap_thresh) { + Tensor &num_out, float nms_overlap_thresh) { int32_t box_num = 
boxes.size(0); int32_t data_align = 16; int32_t mask_num = ((box_num - 1) / data_align + 1) * data_align; + const double iou_threshold = nms_overlap_thresh; at::Tensor mask = at::empty({box_num, mask_num}, boxes.options().dtype(at::kShort)); - EXEC_NPU_CMD(aclnnNms3dNormal, boxes, nms_overlap_thresh, mask); + EXEC_NPU_CMD(aclnnNms3dNormal, boxes, iou_threshold, mask); - keep = at::zeros({box_num}, mask.options()); - keep_num = at::zeros(1, mask.options()); - EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep, keep_num); + Tensor keep_t = at::zeros({box_num}, mask.options()); + Tensor num_out_t = at::zeros(1, mask.options()); + EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep_t, num_out_t); + num_out.fill_(num_out_t.item().toLong()); + keep.copy_(keep_t); } void iou3d_nms3d_normal_forward_impl(const Tensor boxes, Tensor &keep, - Tensor &keep_num, - float nms_overlap_thresh); + Tensor &num_out, float nms_overlap_thresh); REGISTER_NPU_IMPL(iou3d_nms3d_normal_forward_impl, iou3d_nms3d_normal_forward_npu); diff --git a/mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp b/mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp index 13fe6db860..a143ed07b5 100644 --- a/mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/nms3d_npu.cpp @@ -5,22 +5,26 @@ using namespace std; constexpr int32_t BOX_DIM = 7; -void iou3d_nms3d_forward_npu(const Tensor boxes, Tensor &keep, Tensor &keep_num, +void iou3d_nms3d_forward_npu(const Tensor boxes, Tensor &keep, Tensor &num_out, float nms_overlap_thresh) { TORCH_CHECK((boxes.sizes()[1] == BOX_DIM), "Input boxes shape should be (N, 7)"); int32_t box_num = boxes.size(0); int32_t data_align = 16; int32_t mask_num = ((box_num - 1) / data_align + 1) * data_align; + const double iou_threshold = nms_overlap_thresh; at::Tensor mask = at::empty({box_num, mask_num}, boxes.options().dtype(at::kShort)); - EXEC_NPU_CMD(aclnnNms3d, boxes, nms_overlap_thresh, mask); - keep = at::zeros({box_num}, mask.options()); - keep_num = at::zeros(1, mask.options()); - 
EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep, keep_num); + EXEC_NPU_CMD(aclnnNms3d, boxes, iou_threshold, mask); + + Tensor keep_t = at::zeros({box_num}, mask.options()); + Tensor num_out_t = at::zeros(1, mask.options()); + EXEC_NPU_CMD(aclnnGatherNms3dMask, mask, keep_t, num_out_t); + num_out.fill_(num_out_t.item().toLong()); + keep.copy_(keep_t); } -void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep, - Tensor &keep_num, float nms_overlap_thresh); +void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep, Tensor &num_out, + float nms_overlap_thresh); REGISTER_NPU_IMPL(iou3d_nms3d_forward_impl, iou3d_nms3d_forward_npu); From 14f031a4eaad1299e75b81dc3ec9075e347a3a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E5=AE=8F=E7=AC=8B?= Date: Tue, 18 Jun 2024 09:31:27 +0800 Subject: [PATCH 05/33] repair nms_rotated bug --- mmcv/ops/nms.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/mmcv/ops/nms.py b/mmcv/ops/nms.py index 0c6adfabc7..2f1d27c4d2 100644 --- a/mmcv/ops/nms.py +++ b/mmcv/ops/nms.py @@ -405,23 +405,37 @@ def nms_rotated(dets: Tensor, flip_mat[-1] = -1 dets_cw = dets * flip_mat else: - dets_cw = dets.clone() + dets_cw = dets multi_label = labels is not None if labels is None: input_labels = scores.new_empty(0, dtype=torch.int) else: input_labels = labels - if dets.device.type in ('npu', 'mlu'): + + if dets.device.type == 'mlu': + order = scores.new_empty(0, dtype=torch.long) + keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw, + input_labels, iou_threshold, + multi_label) + dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)), + dim=1) + return dets, keep_inds + + if dets.device.type == 'npu': order = scores.new_empty(0, dtype=torch.long) - if dets.device.type == 'npu': - coefficient = 57.29578 # 180 / PI + coefficient = 57.29578 # 180 / PI + if dets.dtype == torch.float16: dets_cw = dets_cw.float() - scores = scores.float() - for i in 
range(dets.size()[0]): - dets_cw[i][4] *= coefficient # radians to angle + else: + dets_cw = dets_cw.clone() + for i in range(dets.size()[0]): + dets_cw[i][4] *= coefficient # radians to angle + scores = scores.float() keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw, input_labels, iou_threshold, multi_label) + if dets.dtype == torch.float16: + scores = scores.half() dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)), dim=1) return dets, keep_inds From ca0e42760b5382865bfad48958f12f250930c47d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 21 Jun 2024 12:02:10 +0800 Subject: [PATCH 06/33] add dtype check for roi_align --- mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp b/mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp index 0e673614fa..12a6f65c04 100644 --- a/mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/roi_align_npu.cpp @@ -7,6 +7,8 @@ void roi_align_forward_npu(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { + TORCH_CHECK(input.scalar_type() == at::kFloat, + "input should be a float tensor"); int64_t roi_end_mode = 2; if (!aligned) { LOG(WARNING) << "The [aligned] attr in roi_align op is false"; @@ -33,6 +35,8 @@ void roi_align_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax_y, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { + TORCH_CHECK(grad_output.scalar_type() == at::kFloat, + "input should be a float tensor"); int64_t aligned_height_64 = aligned_height; int64_t aligned_width_64 = aligned_width; int64_t sampling_ratio_64 = sampling_ratio; From 2ed5eb51ee96dffe536b2d4dcfee784fc20356df Mon Sep 17 00:00:00 2001 From: zhuweichen Date: Fri, 21 Jun 2024 15:44:08 +0800 Subject: [PATCH 07/33] 
adapt box_iou_rotaed to boxes_overlap_bev Update box_iou_rotated_npu.cpp --- mmcv/ops/box_iou_rotated.py | 9 +++++---- mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/mmcv/ops/box_iou_rotated.py b/mmcv/ops/box_iou_rotated.py index a811531d42..0f8a8e298b 100644 --- a/mmcv/ops/box_iou_rotated.py +++ b/mmcv/ops/box_iou_rotated.py @@ -143,10 +143,11 @@ def box_iou_rotated(bboxes1: torch.Tensor, bboxes1 = bboxes1 * flip_mat bboxes2 = bboxes2 * flip_mat if bboxes1.device.type == 'npu': - scale_mat = bboxes1.new_ones(bboxes1.shape[-1]) - scale_mat[-1] = 1.0 / 0.01745329252 - bboxes1 = bboxes1 * scale_mat - bboxes2 = bboxes2 * scale_mat + if (mode_flag == 1 or aligned or not clockwise): + scale_mat = bboxes1.new_ones(bboxes1.shape[-1]) + scale_mat[-1] = 1.0 / 0.01745329252 + bboxes1 = bboxes1 * scale_mat + bboxes2 = bboxes2 * scale_mat bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp index c6e6b66478..14df358080 100644 --- a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp @@ -8,6 +8,14 @@ void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious, const int mode_flag, const bool aligned) { + if (mode_flag == 0 && aligned == false) { + auto trans = false; + auto is_clockwise = false; + auto need_iou = true; + EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise, + need_iou, ious); + return; + } at::Tensor boxes = at::ones_like(boxes1); at::Tensor query_boxes = at::ones_like(boxes2); boxes = boxes1.transpose(0, 1).unsqueeze(0); From 7c56ba934ece19564f16605d33224daef7bebc67 Mon Sep 17 00:00:00 2001 From: wujiadi Date: Tue, 25 Jun 2024 21:09:34 +0800 Subject: [PATCH 08/33] 
fix the bug of DeformableRoiPoolGrad --- mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp b/mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp index 074e52d4f4..42de978e88 100644 --- a/mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp +++ b/mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp @@ -53,7 +53,7 @@ void deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois, .Output(grad_offset) .Attr("output_size", output_size) .Attr("spatial_scale", spatial_scale) - .Attr("sample_ratio", sampling_ratio_) + .Attr("sampling_ratio", sampling_ratio_) .Attr("gamma", gamma) .Run(); } From f5724756a7aaed9123ca9d660bcb2de296ebae8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=8C=AF=E8=B1=AA?= Date: Fri, 19 Jul 2024 16:09:37 +0800 Subject: [PATCH 09/33] Interfaces change. --- mmcv/ops/scatter_points.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mmcv/ops/scatter_points.py b/mmcv/ops/scatter_points.py index 6902343a03..e0aec67ce6 100644 --- a/mmcv/ops/scatter_points.py +++ b/mmcv/ops/scatter_points.py @@ -39,11 +39,11 @@ def forward(ctx: Any, ctx.device = feats.device.type if ctx.device == 'npu': import ads_c - voxel_idx = ads_c.point_to_voxel(coors, None, None) + voxel_idx = ads_c.point_to_voxel(coors, [], []) unique_res = ads_c.unique_voxel(voxel_idx) num_voxels, uniqued_voxel_idx, prefix_sum, \ - argsort_coor = unique_res - voxel_coors = ads_c.voxel_to_point(uniqued_voxel_idx, None, None) + argsort_coor, _ = unique_res + voxel_coors = ads_c.voxel_to_point(uniqued_voxel_idx, [], []) voxel_feats, \ compare_mask = ads_c.npu_dynamic_scatter(feats, coors, prefix_sum, From 6728a0470c95d318d9781ef5b78568d784abbafb Mon Sep 17 00:00:00 2001 From: ZYF-Annarine Date: Mon, 22 Jul 2024 15:58:01 +0800 Subject: [PATCH 10/33] chamfer push_back --- .../csrc/pytorch/npu/chamfer_distance_npu.cpp | 17 +++++++++-------- 1 file changed, 9 
insertions(+), 8 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp index 170a5fa72a..9345da6dec 100644 --- a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp @@ -6,17 +6,17 @@ using namespace std; void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1, Tensor dist2, Tensor idx1, Tensor idx2) { bool is_half = XYZ1.scalar_type() == at::kHalf; - at::Tensor xyz1 = XYZ1; - at::Tensor xyz2 = XYZ2; - at::Tensor distf1 = dist1; - at::Tensor distf2 = dist2; + at::Tensor xyz1 = at::ones_like(XYZ1); + at::Tensor xyz2 = at::ones_like(XYZ2); + at::Tensor distf1 = at::ones_like(dist1); + at::Tensor distf2 = at::ones_like(dist2); xyz1 = XYZ1.transpose(1, 2).transpose(0, 1); xyz2 = XYZ2.transpose(1, 2).transpose(0, 1); if (is_half) { xyz1 = xyz1.to(at::kFloat); xyz2 = xyz2.to(at::kFloat); - distf1 = distf1.to(at::kFloat); - distf2 = distf2.to(at::kFloat); + distf1 = dist1.to(at::kFloat); + distf2 = dist2.to(at::kFloat); } OpCommand cmd; cmd.Name("ChamferDistance") @@ -31,10 +31,11 @@ void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1, distf1 = distf1.to(at::kHalf); distf2 = distf2.to(at::kHalf); } - dist1 = distf1; - dist2 = distf2; + dist1.copy_(distf1); + dist2.copy_(distf2); } + void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1, Tensor idx2, Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, From 5e0bd899596d47dcee4eeb8d4573398358f9fe5c Mon Sep 17 00:00:00 2001 From: Jing-Yi Li Date: Wed, 24 Jul 2024 14:19:17 +0800 Subject: [PATCH 11/33] adapt boxes_overlap_bev & box_iou_rotated --- mmcv/ops/box_iou_rotated.py | 6 --- .../csrc/pytorch/npu/box_iou_rotated_npu.cpp | 47 +++---------------- .../pytorch/npu/boxes_overlap_bev_npu.cpp | 25 ++++++++++ tests/test_ops/test_iou3d.py | 6 ++- 4 files changed, 37 insertions(+), 47 deletions(-) create mode 100644 
mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp diff --git a/mmcv/ops/box_iou_rotated.py b/mmcv/ops/box_iou_rotated.py index 0f8a8e298b..8e199d9ac8 100644 --- a/mmcv/ops/box_iou_rotated.py +++ b/mmcv/ops/box_iou_rotated.py @@ -142,12 +142,6 @@ def box_iou_rotated(bboxes1: torch.Tensor, flip_mat[-1] = -1 bboxes1 = bboxes1 * flip_mat bboxes2 = bboxes2 * flip_mat - if bboxes1.device.type == 'npu': - if (mode_flag == 1 or aligned or not clockwise): - scale_mat = bboxes1.new_ones(bboxes1.shape[-1]) - scale_mat[-1] = 1.0 / 0.01745329252 - bboxes1 = bboxes1 * scale_mat - bboxes2 = bboxes2 * scale_mat bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp index 14df358080..d8b0bbaa67 100644 --- a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp @@ -8,48 +8,15 @@ void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious, const int mode_flag, const bool aligned) { - if (mode_flag == 0 && aligned == false) { - auto trans = false; - auto is_clockwise = false; - auto need_iou = true; - EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise, - need_iou, ious); - return; - } - at::Tensor boxes = at::ones_like(boxes1); - at::Tensor query_boxes = at::ones_like(boxes2); - boxes = boxes1.transpose(0, 1).unsqueeze(0); - query_boxes = boxes2.transpose(0, 1).unsqueeze(0); - bool is_trans = false; - string modeStr = "iou"; - if (mode_flag == 1) { - modeStr = "iof"; - } - bool is_cross = true; - if (aligned) { - is_cross = false; - } - float v_threshold = 0; - float e_threshold = 0; + TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)"); + TORCH_CHECK(boxes2.size(1) == 5, "boxes2 must be 2D tensor (N, 5)"); - OpCommand cmd; - cmd.Name("RotatedIou") -
.Input(boxes) - .Input(query_boxes) - .Output(ious) - .Attr("trans", is_trans) - .Attr("mode", modeStr) - .Attr("is_cross", is_cross) - .Attr("v_threshold", v_threshold) - .Attr("e_threshold", e_threshold) - .Run(); - - if (is_cross) { - ious = ious.view({boxes1.size(0), boxes2.size(0)}); - } else { - ious = ious.view({boxes1.size(0), 1}); - } + auto trans = false; + auto is_clockwise = false; + EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise, + aligned, mode_flag, ious); + return; } REGISTER_NPU_IMPL(box_iou_rotated_impl, box_iou_rotated_npu); diff --git a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp new file mode 100644 index 0000000000..2a083de2f7 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp @@ -0,0 +1,25 @@ +#include "pytorch_npu_helper.hpp" + +using namespace NPU_NAME_SPACE; +using namespace std; + +void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_overlap); + +void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_overlap) { + + TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)"); + TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)"); + + auto trans = false; + auto is_clockwise = false; + auto aligned = false; + auto mode_flag = 2; + EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, trans, is_clockwise, aligned, mode_flag, ans_overlap); + return; +} + +REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, iou3d_boxes_overlap_bev_forward_npu); diff --git a/tests/test_ops/test_iou3d.py b/tests/test_ops/test_iou3d.py index 27a09eb361..6b2456e8b9 100644 --- a/tests/test_ops/test_iou3d.py +++ b/tests/test_ops/test_iou3d.py @@ -11,7 +11,11 @@ pytest.param( 'cuda', marks=pytest.mark.skipif( - not IS_CUDA_AVAILABLE, reason='requires CUDA support')) + not 
IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'npu', + marks=pytest.mark.skipif( + not IS_NPU_AVAILABLE, reason='requires NPU support')) ]) def test_boxes_overlap_bev(device): np_boxes1 = np.asarray([[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0], From a6d4f0dd7a3a6cee476891d1ede26fc036e66ce6 Mon Sep 17 00:00:00 2001 From: Pr0Wh1teGivee Date: Mon, 5 Aug 2024 11:40:05 +0800 Subject: [PATCH 12/33] fix msda Update chamfer_distance_npu.cpp --- mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp | 1 - mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp | 12 +++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp index 9345da6dec..4f5c32dbec 100644 --- a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp @@ -35,7 +35,6 @@ void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1, dist2.copy_(distf2); } - void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1, Tensor idx2, Tensor grad_dist1, Tensor grad_dist2, Tensor grad_xyz1, diff --git a/mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp index 453be7034c..df23543452 100644 --- a/mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/ms_deform_attn_npu.cpp @@ -57,15 +57,9 @@ Tensor ms_deform_attn_forward_npu(const Tensor &value, value.size(0), sampling_locations.size(1), value.size(2) * value.size(3)}; at::Tensor output = at::zeros(output_size, value_fp32.options()); - OpCommand cmd; - cmd.Name("MultiScaleDeformableAttnFunction") - .Input(value_fp32) - .Input(value_spatial_shapes_int32) - .Input(value_level_start_index_int32) - .Input(sampling_locations_fp32) - .Input(attention_weights_fp32) - .Output(output) - .Run(); + EXEC_NPU_CMD(aclnnMultiScaleDeformableAttnFunction, value_fp32, + value_spatial_shapes_int32, 
value_level_start_index_int32, + sampling_locations_fp32, attention_weights_fp32, output); at::Tensor real_output = output; if (value.scalar_type() != at::kFloat) { From b7702bfd64718e3ba3258ab6ab69c0019e21ecc9 Mon Sep 17 00:00:00 2001 From: momo609 <963372609@qq.com> Date: Wed, 7 Aug 2024 16:40:21 +0800 Subject: [PATCH 13/33] fix focallossgrad --- mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp | 1 + mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp | 16 +++------------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp b/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp index 3f3bc5a047..ef7df560c9 100644 --- a/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp @@ -194,6 +194,7 @@ void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight, weight_y = at::sum(weight_y, 1, true); weight_y = at::broadcast_to(weight_y, input.sizes()); } + grad_input_y = grad_input_y.fill_(0); OpCommand cmd; string reduction = "none"; cmd.Name("SoftmaxFocalLossGrad") diff --git a/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp b/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp index 279f14008b..747380fb09 100644 --- a/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp @@ -24,12 +24,6 @@ void gather_points_forward_npu(int b, int c, int n, int npoints, void gather_points_backward_npu(int b, int c, int n, int npoints, const Tensor grad_out, const Tensor idx, Tensor grad_points) { - at::Tensor grad_out_cast = grad_out; - at::Tensor grad_points_cast = grad_points; - if (grad_out.scalar_type() == at::ScalarType::Half) { - grad_out_cast = grad_out.to(at::kFloat); - grad_points_cast = grad_points.to(at::kFloat); - } at::Tensor indices = idx; if (idx.scalar_type() != at::ScalarType::Int) { indices = idx.to(at::kInt); @@ -43,11 +37,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints, for (uint64_t i = 0; i < 
shape.size(); i++) { pad_size.emplace_back(shape[i]); } - at::Tensor trans_grad_points = grad_points_cast.transpose(1, 2).contiguous(); + at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous(); at::Tensor grad_points_view = trans_grad_points.view( {trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1], trans_grad_points.sizes()[2]}); - at::Tensor trans_grad_out = grad_out_cast.transpose(1, 2).contiguous(); + at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous(); trans_grad_out = trans_grad_out.view( {trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1], trans_grad_out.sizes()[2]}); @@ -69,11 +63,7 @@ void gather_points_backward_npu(int b, int c, int n, int npoints, at::Tensor grad_points_result = grad_points_view.view(trans_grad_points.sizes()); grad_points_result = grad_points_result.transpose(1, 2); - at::Tensor grad_points_result_cast = grad_points_result; - if (grad_out.scalar_type() == at::ScalarType::Half) { - grad_points_result_cast = grad_points_result.to(at::kHalf); - } - grad_points.copy_(grad_points_result_cast); + grad_points.copy_(grad_points_result); } void gather_points_forward_impl(int b, int c, int n, int npoints, From 650c3218f845d13e7819aceb34585b209266ff5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=B8=AE=E6=94=BF?= Date: Thu, 24 Oct 2024 15:49:51 +0800 Subject: [PATCH 14/33] =?UTF-8?q?points=5Fin=5Fboxes=5Fall=E7=9A=84mmcv?= =?UTF-8?q?=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pytorch/npu/points_in_box_npu_all.cpp | 20 +++++++++++++++++++ mmcv/ops/points_in_boxes.py | 8 +++++--- 2 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 mmcv/ops/csrc/pytorch/npu/points_in_box_npu_all.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/points_in_box_npu_all.cpp b/mmcv/ops/csrc/pytorch/npu/points_in_box_npu_all.cpp new file mode 100644 index 0000000000..acf70d81c2 --- /dev/null +++ 
b/mmcv/ops/csrc/pytorch/npu/points_in_box_npu_all.cpp @@ -0,0 +1,20 @@ +#include "pytorch_npu_helper.hpp" + +using namespace NPU_NAME_SPACE; +using namespace std; + +void points_in_boxes_all_forward_impl_npu(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + c10::SmallVector output_size = {pts.size(0), pts.size(1), boxes.size(1)}; + auto boxes_trans = boxes.transpose(1, 2).contiguous(); + EXEC_NPU_CMD(aclnnPointsInBoxAll, boxes_trans, pts, box_idx_of_points); +} + +void points_in_boxes_all_forward_impl(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points); +REGISTER_NPU_IMPL(points_in_boxes_all_forward_impl, + points_in_boxes_all_forward_impl_npu); diff --git a/mmcv/ops/points_in_boxes.py b/mmcv/ops/points_in_boxes.py index 23c35da4eb..50a085cf46 100644 --- a/mmcv/ops/points_in_boxes.py +++ b/mmcv/ops/points_in_boxes.py @@ -130,9 +130,11 @@ def points_in_boxes_all(points: Tensor, boxes: Tensor) -> Tensor: points_device = points.get_device() assert points_device == boxes.get_device(), \ 'Points and boxes should be put on the same device' - if torch.cuda.current_device() != points_device: - torch.cuda.set_device(points_device) - + if points.device.type != 'npu': + if torch.cuda.current_device() != points_device: + torch.cuda.set_device(points_device) + elif points.device.type == 'npu': + boxes[:, :, 2] += boxes[:, :, 5] / 2.0 ext_module.points_in_boxes_all_forward(boxes.contiguous(), points.contiguous(), box_idxs_of_pts) From 8f5a5b90a4f9fb8add5359e0d2d697dd962da453 Mon Sep 17 00:00:00 2001 From: huangyuan Date: Tue, 29 Oct 2024 09:51:56 +0800 Subject: [PATCH 15/33] change the knn and three nn code --- mmcv/ops/csrc/pytorch/npu/knn_npu.cpp | 2 +- mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp | 3 ++- mmcv/ops/knn.py | 13 +++---------- mmcv/ops/three_nn.py | 6 +++--- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git 
a/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp index c4a1bcbd25..d63977f508 100644 --- a/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp @@ -12,7 +12,7 @@ void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz, at::Tensor target = new_xyz.contiguous(); bool is_from_knn = true; - EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2); + EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx); } void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz, diff --git a/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp index 6740a731bc..0a6a10bc81 100644 --- a/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp @@ -11,7 +11,8 @@ void three_nn_forward_npu(int b, int n, int m, const Tensor unknown, at::Tensor target = unknown.contiguous(); bool is_from_knn = false; - EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2); + int nsample = 3; + EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx); } void three_nn_forward_impl(int b, int n, int m, const Tensor unknown, diff --git a/mmcv/ops/knn.py b/mmcv/ops/knn.py index 1e2a68d1d2..d56f74f2c9 100644 --- a/mmcv/ops/knn.py +++ b/mmcv/ops/knn.py @@ -63,17 +63,10 @@ def forward(ctx, N = xyz.shape[1] if xyz.device.type == 'npu': - dist = center_xyz.new_zeros((B, npoint, N)).float() + dist2 = center_xyz.new_zeros((B, npoint, k)).float() + idx = center_xyz.new_zeros((B, npoint, k)).int() ext_module.knn_forward( - xyz, - center_xyz, - torch.Tensor([]).npu(), - dist, - b=B, - n=N, - m=npoint, - nsample=k) - dist2, idx = torch.topk(dist, k, dim=2, largest=False, sorted=True) + xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k) zeros_idx = torch.zeros( xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu() idx.where(dist2 >= 1e10, zeros_idx) diff --git a/mmcv/ops/three_nn.py b/mmcv/ops/three_nn.py 
index 52d504609a..db8bbdef11 100644 --- a/mmcv/ops/three_nn.py +++ b/mmcv/ops/three_nn.py @@ -41,10 +41,10 @@ def forward(ctx: Any, target: torch.Tensor, if dtype_ == torch.float16: target = target.float() source = source.float() - dist = target.new_empty(B, N, m) + dist2 = target.new_empty(B, N, 3) + idx = target.new_empty(B, N, 3, dtype=torch.int32) ext_module.three_nn_forward( - target, source, dist, torch.Tensor([]).npu(), b=B, n=N, m=m) - dist2, idx = torch.topk(dist, 3, dim=2, largest=False, sorted=True) + target, source, dist2, idx, b=B, n=N, m=m) dist2 = torch.sqrt(dist2) if dtype_ == torch.float16: dist2 = dist2.half() From 993979b07990cbd3b6c7e8e511366499ba415207 Mon Sep 17 00:00:00 2001 From: Jing-Yi Li Date: Wed, 30 Oct 2024 09:28:44 +0800 Subject: [PATCH 16/33] codeclean npu/boxes_overlap_bev_npu.cpp --- mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp index 2a083de2f7..6bc6273083 100644 --- a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp @@ -6,7 +6,7 @@ using namespace std; void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a, const int num_b, const Tensor boxes_b, Tensor ans_overlap); - + void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a, const int num_b, const Tensor boxes_b, Tensor ans_overlap) { From 94d6e92853563b643a5e4f6e911b006eedd8a0c2 Mon Sep 17 00:00:00 2001 From: Jing-Yi Li Date: Tue, 24 Sep 2024 13:01:49 +0800 Subject: [PATCH 17/33] adapt npu box_iou_quadri --- .../csrc/pytorch/npu/box_iou_quadri_npu.cpp | 19 +++++++++++++++++++ tests/test_ops/test_box_iou_quadri.py | 10 +++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp diff --git 
a/mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp new file mode 100644 index 0000000000..6baf44f448 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp @@ -0,0 +1,19 @@ +#include "pytorch_npu_helper.hpp" + +using namespace NPU_NAME_SPACE; +using namespace std; + +void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned); + +void box_iou_quadri_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned) { + + TORCH_CHECK(boxes1.size(1) == 8, "boxes1 must be 2D tensor (N, 8)"); + TORCH_CHECK(boxes2.size(1) == 8, "boxes2 must be 2D tensor (N, 8)"); + + EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious); + return; +} + +REGISTER_NPU_IMPL(box_iou_quadri_impl, box_iou_quadri_npu); diff --git a/tests/test_ops/test_box_iou_quadri.py b/tests/test_ops/test_box_iou_quadri.py index e5cfcab61b..006f04c5c5 100644 --- a/tests/test_ops/test_box_iou_quadri.py +++ b/tests/test_ops/test_box_iou_quadri.py @@ -3,7 +3,7 @@ import pytest import torch -from mmcv.utils import IS_CUDA_AVAILABLE +from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE class TestBoxIoUQuadri: @@ -14,6 +14,10 @@ class TestBoxIoUQuadri: 'cuda', marks=pytest.mark.skipif( not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'npu', + marks=pytest.mark.skipif( + not IS_NPU_AVAILABLE, reason='requires NPU support')) ]) def test_box_iou_quadri_cuda(self, device): from mmcv.ops import box_iou_quadri @@ -48,6 +52,10 @@ def test_box_iou_quadri_cuda(self, device): 'cuda', marks=pytest.mark.skipif( not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'npu', + marks=pytest.mark.skipif( + not IS_NPU_AVAILABLE, reason='requires NPU support')) ]) def test_box_iou_quadri_iof_cuda(self, device): from mmcv.ops import box_iou_quadri From ae66b320f883871f33fcbba2a7493f44116db6e5 Mon Sep 17
00:00:00 2001 From: huangyuan Date: Fri, 1 Nov 2024 17:18:37 +0800 Subject: [PATCH 18/33] add the roi align rotated v2 ops --- mmcv/ops/__init__.py | 36 ++-- .../pytorch/npu/roi_align_rotated_v2_npu.cpp | 52 ++++++ mmcv/ops/csrc/pytorch/pybind.cpp | 19 ++ .../ops/csrc/pytorch/roi_align_rotated_v2.cpp | 37 ++++ mmcv/ops/roi_align_rotated_v2.py | 166 ++++++++++++++++++ 5 files changed, 293 insertions(+), 17 deletions(-) create mode 100644 mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp create mode 100644 mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp create mode 100644 mmcv/ops/roi_align_rotated_v2.py diff --git a/mmcv/ops/__init__.py b/mmcv/ops/__init__.py index ffad9b2bfd..132e010c74 100755 --- a/mmcv/ops/__init__.py +++ b/mmcv/ops/__init__.py @@ -56,6 +56,7 @@ from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated from .roi_align import RoIAlign, roi_align from .roi_align_rotated import RoIAlignRotated, roi_align_rotated +from .roi_align_rotated_v2 import RoIAlignRotatedV2, roi_align_rotated_v2 from .roi_pool import RoIPool, roi_pool from .roiaware_pool3d import RoIAwarePool3d from .roipoint_pool3d import RoIPointPool3d @@ -92,23 +93,24 @@ 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', - 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', - 'contour_expand', 'three_nn', 'three_interpolate', - 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', - 'gather_points', 'furthest_point_sample', 'nms_quadri', - 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', - 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', - 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization', - 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', - 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', - 'SparseInverseConv2d', 
'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', - 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', - 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', - 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', - 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', - 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', - 'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d', - 'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align' + 'RoIAlignRotatedV2', 'roi_align_rotated_v2', 'pixel_group', + 'QueryAndGroup', 'GroupAll', 'grouping_operation', 'contour_expand', + 'three_nn', 'three_interpolate', 'MultiScaleDeformableAttention', + 'BorderAlign', 'border_align', 'gather_points', 'furthest_point_sample', + 'nms_quadri', 'furthest_point_sample_with_dist', 'PointsSampler', + 'Correlation', 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', + 'nms_bev', 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', + 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', + 'SparseConv2d', 'SparseConv3d', 'SparseConvTranspose2d', + 'SparseConvTranspose3d', 'SparseInverseConv2d', 'SparseInverseConv3d', + 'SubMConv2d', 'SubMConv3d', 'SparseModule', 'SparseSequential', + 'SparseMaxPool2d', 'SparseMaxPool3d', 'SparseConvTensor', 'scatter_nd', + 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all', + 'points_in_polygons', 'min_area_polygons', 'active_rotated_filter', + 'convex_iou', 'convex_giou', 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', + 'chamfer_distance', 'PrRoIPool', 'prroi_pool', 'bias_act', + 'filtered_lrelu', 'conv2d', 'conv_transpose2d', 'filter2d', 'upsample2d', + 'BezierAlign', 'bezier_align' ] if IS_MLU_AVAILABLE: diff --git a/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp b/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp new file mode 100644 index 0000000000..b2ea93b261 --- /dev/null +++ 
b/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp @@ -0,0 +1,52 @@ +#include "pytorch_npu_helper.hpp" + +using namespace NPU_NAME_SPACE; +using namespace std; + +void roi_align_rotated_v2_forward_npu(const Tensor input, Tensor rois_map, + Tensor output, + double spatial_scale, + int32_t sampling_ratio, + int32_t pooled_height, + int32_t pooled_width, + bool aligned, + bool clockwise) { + at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); + at::Tensor rois = rois_map.permute({1, 0}).contiguous(); + EXEC_NPU_CMD(aclnnRoiAlignRotatedV2, feature_map, rois, spatial_scale, sampling_ratio, pooled_height, pooled_width, aligned, clockwise, output); +} + +void roi_align_rotated_v2_forward_impl(const Tensor input, Tensor rois, + Tensor output, + double spatial_scale, + int32_t sampling_ratio, + int32_t pooled_height, + int32_t pooled_width, + bool aligned, + bool clockwise); + +REGISTER_NPU_IMPL(roi_align_rotated_v2_forward_impl, roi_align_rotated_v2_forward_npu); + +void roi_align_rotated_v2_backward_npu(const Tensor input, Tensor rois, + Tensor grad_output, Tensor grad_input, + int32_t pooled_height, + int32_t pooled_width, + double spatial_scale, + int32_t sampling_ratio, + bool aligned, + bool clockwise) { + EXEC_NPU_CMD(aclnnRoiAlignRotatedGradV2, input, rois, grad_output, + pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned, clockwise, + grad_input); +} + +void roi_align_rotated_v2_backward_impl(const Tensor input, Tensor rois, + Tensor grad_output, Tensor grad_input, + int32_t pooled_height, + int32_t pooled_width, + double spatial_scale, + int32_t sampling_ratio, + bool aligned, + bool clockwise); + +REGISTER_NPU_IMPL(roi_align_rotated_v2_backward_impl, roi_align_rotated_v2_backward_npu); diff --git a/mmcv/ops/csrc/pytorch/pybind.cpp b/mmcv/ops/csrc/pytorch/pybind.cpp index c8591a5cc1..5d73d08adf 100644 --- a/mmcv/ops/csrc/pytorch/pybind.cpp +++ b/mmcv/ops/csrc/pytorch/pybind.cpp @@ -331,6 +331,15 @@ void 
roi_align_rotated_backward(Tensor grad_output, Tensor rois, int sampling_ratio, bool aligned, bool clockwise); +void roi_align_rotated_v2_forward(Tensor input, Tensor rois, Tensor output, + double spatial_scale, int sampling_ratio, + int aligned_height, int aligned_width, + bool aligned, bool clockwise); + +void roi_align_rotated_v2_backward(Tensor input, Tensor rois, Tensor grad_output, Tensor grad_input, + int pooled_height, int pooled_width, double spatial_scale, + int sampling_ratio, bool aligned, bool clockwise); + std::vector dynamic_point_to_voxel_forward( const torch::Tensor &feats, const torch::Tensor &coors, const std::string &reduce_type); @@ -792,6 +801,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("grad_output"), py::arg("pooled_height"), py::arg("pooled_width"), py::arg("spatial_scale"), py::arg("sampling_ratio"), py::arg("aligned"), py::arg("clockwise")); + m.def("roi_align_rotated_v2_forward", &roi_align_rotated_v2_forward, + "roi_align_rotated_v2_forward", py::arg("input"), py::arg("rois"), + py::arg("output"), py::arg("spatial_scale"), py::arg("sampling_ratio"), + py::arg("pooled_height"), py::arg("pooled_width"), + py::arg("aligned"), py::arg("clockwise")); + m.def("roi_align_rotated_v2_backward", &roi_align_rotated_v2_backward, + "roi_align_rotated_v2_backward", py::arg("input"), py::arg("rois"), + py::arg("grad_output"), py::arg("grad_input"), py::arg("pooled_height"), + py::arg("pooled_width"), py::arg("spatial_scale"), py::arg("sampling_ratio"), + py::arg("aligned"), py::arg("clockwise")); m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, "dynamic_point_to_voxel_forward", py::arg("feats"), py::arg("coors"), py::arg("reduce_type")); diff --git a/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp b/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp new file mode 100644 index 0000000000..7743775288 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp @@ -0,0 +1,37 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void roi_align_rotated_v2_forward_impl(Tensor input, Tensor rois, Tensor output, + double spatial_scale, int sampling_ratio, + int pooled_height, int pooled_width, + bool aligned, bool clockwise) { + DISPATCH_DEVICE_IMPL(roi_align_rotated_v2_forward_impl, input, rois, output, + spatial_scale, sampling_ratio, pooled_height, pooled_width, + aligned, clockwise); +} + + +void roi_align_rotated_v2_forward(Tensor input, Tensor rois, Tensor output, + double spatial_scale, int sampling_ratio, + int pooled_height, int pooled_width, + bool aligned, bool clockwise) { + roi_align_rotated_v2_forward_impl(input, rois, output, spatial_scale, sampling_ratio, + pooled_height, pooled_width, aligned, clockwise); +} + + +void roi_align_rotated_v2_backward_impl(Tensor input, Tensor rois, Tensor grad_output, Tensor grad_input, + int pooled_height, int pooled_width, double spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + DISPATCH_DEVICE_IMPL(roi_align_rotated_v2_backward_impl, input, rois, grad_output, grad_input, + pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned, clockwise); +} + + +void roi_align_rotated_v2_backward(Tensor input, Tensor rois, Tensor grad_output, Tensor grad_input, + int pooled_height, int pooled_width, double spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + roi_align_rotated_v2_backward_impl(input, rois, grad_output, grad_input, + pooled_height, pooled_width, spatial_scale, sampling_ratio, aligned, clockwise); +} diff --git a/mmcv/ops/roi_align_rotated_v2.py b/mmcv/ops/roi_align_rotated_v2.py new file mode 100644 index 0000000000..639fea3a23 --- /dev/null +++ b/mmcv/ops/roi_align_rotated_v2.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any
+
+import torch
+import torch.nn as nn
+from mmengine.utils import deprecated_api_warning
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+# Both ops are invoked through ``ext_module`` below, so both are listed.
+ext_module = ext_loader.load_ext(
+    '_ext',
+    ['roi_align_rotated_v2_forward', 'roi_align_rotated_v2_backward'])
+
+
+class RoIAlignRotatedV2Function(Function):
+    """Autograd function wrapping the ``roi_align_rotated_v2`` extension ops.
+
+    ``forward`` allocates the output and calls
+    ``ext_module.roi_align_rotated_v2_forward``; ``backward`` calls
+    ``ext_module.roi_align_rotated_v2_backward`` and returns a gradient for
+    ``input`` only.
+    """
+
+    @staticmethod
+    def symbolic(g, input, rois, spatial_scale, sampling_ratio, pooled_height,
+                 pooled_width, aligned, clockwise):
+        """Emit the ``mmcv::MMCVRoIAlignRotatedV2`` node for ONNX export."""
+        # Every attribute keyword carries a type suffix (``_i`` / ``_f``);
+        # the two pooled sizes previously lacked it.
+        return g.op(
+            'mmcv::MMCVRoIAlignRotatedV2',
+            input,
+            rois,
+            spatial_scale_f=spatial_scale,
+            sampling_ratio_i=sampling_ratio,
+            pooled_height_i=pooled_height,
+            pooled_width_i=pooled_width,
+            aligned_i=aligned,
+            clockwise_i=clockwise)
+
+    @staticmethod
+    def forward(ctx: Any,
+                input: torch.Tensor,
+                rois: torch.Tensor,
+                spatial_scale: float,
+                sampling_ratio: int,
+                pooled_height: int,
+                pooled_width: int,
+                aligned: bool = True,
+                clockwise: bool = False) -> torch.Tensor:
+        """Pool ``input`` over ``rois`` with the extension op.
+
+        The op writes into a buffer of shape
+        ``(num_rois, pooled_height, pooled_width, num_channels)``, which is
+        then transposed so the returned tensor is
+        ``(num_rois, num_channels, pooled_height, pooled_width)``.
+        """
+        ctx.pooled_height = pooled_height
+        ctx.pooled_width = pooled_width
+        ctx.spatial_scale = spatial_scale
+        ctx.sampling_ratio = sampling_ratio
+        ctx.aligned = aligned
+        ctx.clockwise = clockwise
+        ctx.save_for_backward(input, rois)
+        ctx.feature_size = input.size()
+        # Unpacking four values keeps the requirement that ``input`` is 4-D;
+        # only the channel count is needed afterwards.
+        _, num_channels, _, _ = input.size()
+        num_rois = rois.size(0)
+
+        output = input.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width,
+                                 num_channels)
+
+        ext_module.roi_align_rotated_v2_forward(
+            input,
+            rois,
+            output,
+            spatial_scale=ctx.spatial_scale,
+            sampling_ratio=ctx.sampling_ratio,
+            pooled_height=ctx.pooled_height,
+            pooled_width=ctx.pooled_width,
+            aligned=ctx.aligned,
+            clockwise=ctx.clockwise)
+        # Channels-last buffer -> channels-first result.
+        output = output.transpose(2, 3).transpose(1, 2).contiguous()
+        return output
+
+    @staticmethod
+    def backward(ctx: Any, grad_output: torch.Tensor):
+        """Compute the gradient w.r.t. ``input``.
+
+        ``rois`` is permuted with ``(1, 0)`` and ``grad_output`` with
+        ``(0, 2, 3, 1)`` before the extension call. ``grad_input`` is
+        allocated with the channel axis last and permuted back with
+        ``(0, 3, 1, 2)`` afterwards. The seven non-tensor-gradient inputs of
+        ``forward`` receive ``None``.
+        """
+        input, rois = ctx.saved_tensors
+        rois_trans = torch.permute(rois, (1, 0)).contiguous()
+        grad_output_trans = torch.permute(grad_output,
+                                          (0, 2, 3, 1)).contiguous()
+        grad_input = input.new_zeros(
+            input.size(0), input.size(2), input.size(3), input.size(1))
+        ext_module.roi_align_rotated_v2_backward(
+            input, rois_trans, grad_output_trans, grad_input,
+            ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale,
+            ctx.sampling_ratio, ctx.aligned, ctx.clockwise)
+        grad_input = grad_input.permute(0, 3, 1, 2).contiguous()
+
+        return grad_input, None, None, None, None, None, None, None
+
+
+roi_align_rotated_v2 = RoIAlignRotatedV2Function.apply
+
+
+class RoIAlignRotatedV2(nn.Module):
+    """RoI align pooling layer for rotated proposals.
+
+    It accepts a feature map of shape (N, C, H, W) and rois with shape
+    (n, 6) with each roi decoded as (batch_index, center_x, center_y,
+    w, h, angle). The angle is in radian.
+
+    Args:
+        spatial_scale (float): scale the input boxes by this number
+        sampling_ratio (int): number of inputs samples to take for each
+            output sample. 0 to take samples densely for current models.
+        pooled_height (int): height of the pooled output.
+        pooled_width (int): width of the pooled output.
+        aligned (bool): if False, use the legacy implementation in
+            MMDetection. If True, align the results more perfectly.
+            Default: True.
+        clockwise (bool): If True, the angle in each proposal follows a
+            clockwise fashion in image space, otherwise, the angle is
+            counterclockwise. Default: False.
+
+    Note:
+        The implementation of RoIAlign when aligned=True is modified from
+        https://github.com/facebookresearch/detectron2/
+
+        The meaning of aligned=True:
+
+        Given a continuous coordinate c, its two neighboring pixel
+        indices (in our pixel model) are computed by floor(c - 0.5) and
+        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
+        indices [0] and [1] (which are sampled from the underlying signal
+        at continuous coordinates 0.5 and 1.5). But the original roi_align
+        (aligned=False) does not subtract the 0.5 when computing
+        neighboring pixel indices and therefore it uses pixels with a
+        slightly incorrect alignment (relative to our pixel model) when
+        performing bilinear interpolation.
+
+        With `aligned=True`,
+        we first appropriately scale the ROI and then shift it by -0.5
+        prior to calling roi_align. This produces the correct neighbors;
+
+        The difference does not make a difference to the model's
+        performance if ROIAlign is used together with conv layers.
+    """
+
+    # Only ``sample_num`` is mapped: the former ``out_size`` -> ``output_size``
+    # entry pointed at a parameter this constructor does not have.
+    @deprecated_api_warning({'sample_num': 'sampling_ratio'},
+                            cls_name='RoIAlignRotatedV2')
+    def __init__(self,
+                 spatial_scale: float,
+                 sampling_ratio: int,
+                 pooled_height: int,
+                 pooled_width: int,
+                 aligned: bool = True,
+                 clockwise: bool = False):
+        super().__init__()
+
+        self.pooled_height = int(pooled_height)
+        self.pooled_width = int(pooled_width)
+        self.spatial_scale = float(spatial_scale)
+        self.sampling_ratio = int(sampling_ratio)
+        self.aligned = aligned
+        self.clockwise = clockwise
+
+    def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
+        """Apply ``RoIAlignRotatedV2Function`` with this layer's settings."""
+        return RoIAlignRotatedV2Function.apply(input, rois, self.spatial_scale,
+                                               self.sampling_ratio,
+                                               self.pooled_height,
+                                               self.pooled_width, self.aligned,
+                                               self.clockwise)
+
+    def __repr__(self):
+        s = self.__class__.__name__
+        s += f'(pooled_height={self.pooled_height}, '
+        s += f'pooled_width={self.pooled_width}, '
+        s += f'spatial_scale={self.spatial_scale}, '
+        s += f'sampling_ratio={self.sampling_ratio}, '
+        s += f'aligned={self.aligned}, '
+        s += f'clockwise={self.clockwise})'
+        return s
From ffe3a1a48610bcf56aa1405b85c7f219d6553f46 Mon Sep 17 00:00:00 2001 From: Hua-yuxiu Date: Mon, 4 Nov 2024 10:20:19 +0800 Subject: [PATCH 19/33] add assign_score_withk NPU adaptation --- .../pytorch/npu/assign_score_withk_npu.cpp | 23 ++ tests/test_ops/test_assign_score_withk.py | 328 +++++++++--------- 2 files changed, 194 insertions(+), 157 deletions(-) create mode 100644 mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp b/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp new file mode 100644 index 0000000000..8945974a6b --- /dev/null +++
b/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp @@ -0,0 +1,23 @@
+#include "pytorch_npu_helper.hpp"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+// NPU forward launcher for assign_score_withk.
+// Permutes `points` and `centers` with {0, 3, 1, 2} (the permuted tensors are
+// not made contiguous here) and runs aclnnAssignScoreWithk, which writes into
+// `output`. The integer sizes and `aggregate` are passed through unchanged.
+// NOTE(review): presumably the op accepts the strided, permuted inputs as-is
+// -- confirm, since sibling launchers call .contiguous() after permute.
+void assign_score_withk_forward_npu(int B, int N0, int N1, int M, int K, int O,
+                                    int aggregate, const Tensor& points,
+                                    const Tensor& centers,
+                                    const Tensor& scores,
+                                    const Tensor& knn_idx, Tensor& output) {
+  at::Tensor points_trans = points.permute({0, 3, 1, 2});
+  at::Tensor centers_trans = centers.permute({0, 3, 1, 2});
+
+  EXEC_NPU_CMD(aclnnAssignScoreWithk, points_trans, centers_trans, scores, knn_idx, B, N0, N1, M, K, O, aggregate, output);
+}
+
+// Declaration of the entry point the NPU forward is registered under below.
+void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
+                                     int aggregate, const Tensor& points,
+                                     const Tensor& centers,
+                                     const Tensor& scores,
+                                     const Tensor& knn_idx, Tensor& output);
+
+REGISTER_NPU_IMPL(assign_score_withk_forward_impl, assign_score_withk_forward_npu);
diff --git a/tests/test_ops/test_assign_score_withk.py b/tests/test_ops/test_assign_score_withk.py index f8fc6ae626..bd27144545 100644 --- a/tests/test_ops/test_assign_score_withk.py +++ b/tests/test_ops/test_assign_score_withk.py @@ -3,93 +3,105 @@ import torch from mmcv.ops import assign_score_withk +from mmcv.utils import IS_CUDA_AVAILABLE, IS_NPU_AVAILABLE -@pytest.mark.skipif( - not torch.cuda.is_available(), reason='requires CUDA support') -def test_paconv_assign_scores(): - scores = torch.tensor([[[[0.06947571, 0.6065746], [0.28462553, 0.8378516], - [0.7595994, 0.97220325], [0.519155, 0.766185]], - [[0.15348864, 0.6051019], [0.21510637, 0.31916398], - [0.00236845, 0.5842595], [0.6783676, 0.5216348]]], - [[[0.23089725, 0.5568468], [0.7405102, 0.06438422], - [0.6887394, 0.22089851], [0.0502342, 0.79228795]], - [[0.44883424, 0.15427643], - [0.13817799, 0.34856772], [0.7989621, 0.33788306], - [0.15699774, 0.7693662]]]]).float().cuda() - scores.requires_grad_() - points = torch.tensor([[[[0.06001121, 0.92963666, 0.5753327, 0.7251477], - [0.53563064,
0.23129565, 0.92366195, 0.44261628]], - [[0.5770022, 0.56625944, 0.23560429, 0.11178821], - [0.7735967, 0.95678777, 0.25468266, 0.02895975]], - [[0.0589869, 0.09017515, 0.5977862, 0.02797985], - [0.603862, 0.35991007, 0.85761684, 0.3096559]], - [[0.22359002, 0.13983732, 0.5544243, 0.68863827], - [0.85646236, 0.75651926, 0.8638947, 0.83600986]], - [[0.45424145, 0.27458847, 0.6456112, 0.47162914], - [0.15773582, 0.47645122, 0.79964715, 0.3323908]], - [[0.8351399, 0.84696376, 0.9431732, 0.29418713], - [0.77168906, 0.6996871, 0.19354361, 0.03392768]], - [[0.30976456, 0.7074133, 0.581795, 0.976677], - [0.69656056, 0.07199162, 0.4708506, 0.29117996]], - [[0.5829035, 0.30201727, 0.76556486, 0.0935446], - [0.88030535, 0.16129416, 0.9242525, 0.49545723]]], - [[[0.50899494, 0.06482804, 0.44939405, 0.37704808], - [0.47028124, 0.11969638, 0.62823206, 0.28560323]], - [[0.40690207, 0.689753, 0.51636654, 0.23040164], - [0.06935787, 0.00488842, 0.22462702, 0.09182382]], - [[0.26611632, 0.00184339, 0.7730655, 0.5228131], - [0.87776035, 0.77895886, 0.2787183, 0.16620636]], - [[0.502574, 0.04039001, 0.5368497, 0.98379374], - [0.40973026, 0.3238272, 0.9733018, 0.13988364]], - [[0.04586202, 0.20983845, 0.20662665, 0.22270602], - [0.60387236, 0.5155574, 0.51237285, 0.6528438]], - [[0.45735973, 0.86821306, 0.61054605, 0.8370336], - [0.45193362, 0.3734138, 0.7825672, 0.5699416]], - [[0.44591594, 0.12447512, 0.09282011, 0.7055254], - [0.25223452, 0.46696228, 0.7051136, 0.892151]], - [[0.49615085, 0.47321403, 0.93138885, 0.7652197], - [0.38766378, 0.30332977, 0.23131835, - 0.02863514]]]]).float().cuda() - points.requires_grad_() - centers = torch.tensor([[[[0.83878064, 0.96658987, 0.8033424, 0.9598312], - [0.45035273, 0.8768925, 0.977736, 0.54547966]], - [[0.01041394, 0.597893, 0.36212963, 0.4410367], - [0.94879234, 0.8372817, 0.21237361, 0.67945415]], - [[0.5096087, 0.26401454, 0.60034937, 0.5417416], - [0.87591463, 0.546456, 0.4096033, 0.16373193]], - [[0.79547447, 0.1482386, 0.12840575, 
0.45384115], - [0.5640288, 0.944541, 0.5745328, 0.73229736]], - [[0.93011934, 0.7406011, 0.62621707, 0.8677915], - [0.91563636, 0.3595413, 0.6678378, 0.6085383]], - [[0.22431666, 0.65617776, 0.7483924, 0.6263364], - [0.30968404, 0.78204364, 0.14899081, - 0.09628749]], - [[0.73675203, 0.72104895, 0.4648038, 0.6101647], - [0.7817645, 0.16572917, 0.3311919, 0.43407398]], - [[0.8193154, 0.09559608, 0.05978829, 0.90262103], - [0.4256065, 0.8165596, 0.8206446, 0.6604721]]], - [[[0.7159653, 0.18600845, 0.21433902, 0.3159626], - [0.3921569, 0.33221376, 0.5061177, 0.7961841]], - [[0.95338356, 0.04785997, 0.67185795, 0.6538394], - [0.4729132, 0.33404195, 0.17750603, 0.8445621]], - [[0.6755793, 0.16193843, 0.75943846, 0.92123103], - [0.2781859, 0.03114432, 0.710638, 0.52729136]], - [[0.8376105, 0.10858494, 0.13208169, 0.365772], - [0.5930795, 0.27390373, 0.14036089, 0.170403]], - [[0.3479789, 0.89855295, 0.04844379, 0.9871029], - [0.29781651, 0.0244137, 0.9179047, 0.8081611]], - [[0.12460887, 0.44991326, 0.19382608, 0.35037738], - [0.2773472, 0.4362057, 0.36757517, 0.5993509]], - [[0.29630446, 0.90046406, 0.5417113, 0.13510644], - [0.09623539, 0.04226565, 0.32001644, - 0.44358212]], - [[0.5274848, 0.82096446, 0.9415489, 0.7123748], - [0.7537517, 0.8086482, 0.85345286, - 0.7472754]]]]).float().cuda() - centers.requires_grad_() - knn_idx = torch.tensor([[[6, 7, 4, 6], [2, 4, 2, 4]], - [[7, 1, 3, 2], [6, 0, 2, 6]]]).long().cuda() +@pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'npu', + marks=pytest.mark.skipif( + not IS_NPU_AVAILABLE, reason='requires NPU support')) +]) +def test_paconv_assign_scores(device): + scores = torch.tensor( + [[[[0.06947571, 0.6065746], [0.28462553, 0.8378516], + [0.7595994, 0.97220325], [0.519155, 0.766185]], + [[0.15348864, 0.6051019], [0.21510637, 0.31916398], + [0.00236845, 0.5842595], [0.6783676, 0.5216348]]], + [[[0.23089725, 
0.5568468], [0.7405102, 0.06438422], + [0.6887394, 0.22089851], [0.0502342, 0.79228795]], + [[0.44883424, 0.15427643], [0.13817799, 0.34856772], + [0.7989621, 0.33788306], [0.15699774, 0.7693662]]]], + device=device).float() + points = torch.tensor( + [[[[0.06001121, 0.92963666, 0.5753327, 0.7251477], + [0.53563064, 0.23129565, 0.92366195, 0.44261628]], + [[0.5770022, 0.56625944, 0.23560429, 0.11178821], + [0.7735967, 0.95678777, 0.25468266, 0.02895975]], + [[0.0589869, 0.09017515, 0.5977862, 0.02797985], + [0.603862, 0.35991007, 0.85761684, 0.3096559]], + [[0.22359002, 0.13983732, 0.5544243, 0.68863827], + [0.85646236, 0.75651926, 0.8638947, 0.83600986]], + [[0.45424145, 0.27458847, 0.6456112, 0.47162914], + [0.15773582, 0.47645122, 0.79964715, 0.3323908]], + [[0.8351399, 0.84696376, 0.9431732, 0.29418713], + [0.77168906, 0.6996871, 0.19354361, 0.03392768]], + [[0.30976456, 0.7074133, 0.581795, 0.976677], + [0.69656056, 0.07199162, 0.4708506, 0.29117996]], + [[0.5829035, 0.30201727, 0.76556486, 0.0935446], + [0.88030535, 0.16129416, 0.9242525, 0.49545723]]], + [[[0.50899494, 0.06482804, 0.44939405, 0.37704808], + [0.47028124, 0.11969638, 0.62823206, 0.28560323]], + [[0.40690207, 0.689753, 0.51636654, 0.23040164], + [0.06935787, 0.00488842, 0.22462702, 0.09182382]], + [[0.26611632, 0.00184339, 0.7730655, 0.5228131], + [0.87776035, 0.77895886, 0.2787183, 0.16620636]], + [[0.502574, 0.04039001, 0.5368497, 0.98379374], + [0.40973026, 0.3238272, 0.9733018, 0.13988364]], + [[0.04586202, 0.20983845, 0.20662665, 0.22270602], + [0.60387236, 0.5155574, 0.51237285, 0.6528438]], + [[0.45735973, 0.86821306, 0.61054605, 0.8370336], + [0.45193362, 0.3734138, 0.7825672, 0.5699416]], + [[0.44591594, 0.12447512, 0.09282011, 0.7055254], + [0.25223452, 0.46696228, 0.7051136, 0.892151]], + [[0.49615085, 0.47321403, 0.93138885, 0.7652197], + [0.38766378, 0.30332977, 0.23131835, 0.02863514]]]], + device=device).float() + centers = torch.tensor( + [[[[0.83878064, 0.96658987, 0.8033424, 
0.9598312], + [0.45035273, 0.8768925, 0.977736, 0.54547966]], + [[0.01041394, 0.597893, 0.36212963, 0.4410367], + [0.94879234, 0.8372817, 0.21237361, 0.67945415]], + [[0.5096087, 0.26401454, 0.60034937, 0.5417416], + [0.87591463, 0.546456, 0.4096033, 0.16373193]], + [[0.79547447, 0.1482386, 0.12840575, 0.45384115], + [0.5640288, 0.944541, 0.5745328, 0.73229736]], + [[0.93011934, 0.7406011, 0.62621707, 0.8677915], + [0.91563636, 0.3595413, 0.6678378, 0.6085383]], + [[0.22431666, 0.65617776, 0.7483924, 0.6263364], + [0.30968404, 0.78204364, 0.14899081, 0.09628749]], + [[0.73675203, 0.72104895, 0.4648038, 0.6101647], + [0.7817645, 0.16572917, 0.3311919, 0.43407398]], + [[0.8193154, 0.09559608, 0.05978829, 0.90262103], + [0.4256065, 0.8165596, 0.8206446, 0.6604721]]], + [[[0.7159653, 0.18600845, 0.21433902, 0.3159626], + [0.3921569, 0.33221376, 0.5061177, 0.7961841]], + [[0.95338356, 0.04785997, 0.67185795, 0.6538394], + [0.4729132, 0.33404195, 0.17750603, 0.8445621]], + [[0.6755793, 0.16193843, 0.75943846, 0.92123103], + [0.2781859, 0.03114432, 0.710638, 0.52729136]], + [[0.8376105, 0.10858494, 0.13208169, 0.365772], + [0.5930795, 0.27390373, 0.14036089, 0.170403]], + [[0.3479789, 0.89855295, 0.04844379, 0.9871029], + [0.29781651, 0.0244137, 0.9179047, 0.8081611]], + [[0.12460887, 0.44991326, 0.19382608, 0.35037738], + [0.2773472, 0.4362057, 0.36757517, 0.5993509]], + [[0.29630446, 0.90046406, 0.5417113, 0.13510644], + [0.09623539, 0.04226565, 0.32001644, 0.44358212]], + [[0.5274848, 0.82096446, 0.9415489, 0.7123748], + [0.7537517, 0.8086482, 0.85345286, 0.7472754]]]], + device=device).float() + if device == 'cuda': + points.requires_grad_() + scores.requires_grad_() + centers.requires_grad_() + knn_idx = torch.tensor( + [[[6, 7, 4, 6], [2, 4, 2, 4]], [[7, 1, 3, 2], [6, 0, 2, 6]]], + device=device).long() aggregate = 'sum' expected_output = torch.tensor( [[[[-0.08134781, 0.03877336, -0.8212776, -0.2869547], @@ -114,75 +126,77 @@ def test_paconv_assign_scores(): assert 
torch.allclose(output.detach().cpu(), expected_output, atol=1e-6) # test backward - loss = output.sum() - loss.backward() - expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683], - [-0.78873926, 0.7485497], - [-0.6866992, 0.05346543], - [0.04288036, -0.18217683]], - [[-1.1407862, 0.13533896], - [-0.06964391, -0.22948086], - [-1.1407862, 0.13533896], - [-0.06964391, -0.22948086]]], - [[[-0.3363995, -2.212181], - [-1.1589496, -2.7724311], - [-0.9387654, -1.3163853], - [-1.4385346, -1.0614843]], - [[-0.5048497, 1.4143617], - [-0.47332114, 0.6017133], - [-0.30974793, 1.1995442], - [-0.5048497, 1.4143617]]]]).float() - expected_points_grad = torch.tensor( - [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0.15585709, 0.15585709, 0.15585709, 0.15585709], - [1.1893613, 1.1893613, 1.1893613, 1.1893613]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[1.6530733, 1.6530733, 1.6530733, 1.6530733], - [1.8130021, 1.8130021, 1.8130021, 1.8130021]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0.58863074, 0.58863074, 0.58863074, 0.58863074], - [1.3727596, 1.3727596, 1.3727596, 1.3727596]], - [[0.28462553, 0.28462553, 0.28462553, 0.28462553], - [0.8378516, 0.8378516, 0.8378516, 0.8378516]]], - [[[0.13817799, 0.13817799, 0.13817799, 0.13817799], - [0.34856772, 0.34856772, 0.34856772, 0.34856772]], - [[0.7405102, 0.7405102, 0.7405102, 0.7405102], - [0.06438422, 0.06438422, 0.06438422, 0.06438422]], - [[0.8491963, 0.8491963, 0.8491963, 0.8491963], - [1.1301711, 1.1301711, 1.1301711, 1.1301711]], - [[0.6887394, 0.6887394, 0.6887394, 0.6887394], - [0.22089851, 0.22089851, 0.22089851, 0.22089851]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0.605832, 0.605832, 0.605832, 0.605832], - [0.92364264, 0.92364264, 0.92364264, 0.92364264]], - [[0.23089725, 0.23089725, 0.23089725, 0.23089725], - [0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float() - expected_centers_grad = torch.tensor( - [[[[0., 0., 0., 0.], 
[0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[-1.0493311, -1.0493311, -1.0493311, -1.0493311], - [-2.0301602, -2.0301602, -2.0301602, -2.0301602]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[-1.6328557, -1.6328557, -1.6328557, -1.6328557], - [-3.1828144, -3.1828144, -3.1828144, -3.1828144]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]]], - [[[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[-1.5429721, -1.5429721, -1.5429721, -1.5429721], - [-1.6100934, -1.6100934, -1.6100934, -1.6100934]], - [[-1.7103812, -1.7103812, -1.7103812, -1.7103812], - [-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float() - assert torch.allclose( - scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6) - assert torch.allclose( - points.grad.detach().cpu(), expected_points_grad, atol=1e-6) - assert torch.allclose( - centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6) + if device == 'cuda': + loss = output.sum() + loss.backward() + expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683], + [-0.78873926, 0.7485497], + [-0.6866992, 0.05346543], + [0.04288036, -0.18217683]], + [[-1.1407862, 0.13533896], + [-0.06964391, -0.22948086], + [-1.1407862, 0.13533896], + [-0.06964391, -0.22948086]]], + [[[-0.3363995, -2.212181], + [-1.1589496, -2.7724311], + [-0.9387654, -1.3163853], + [-1.4385346, -1.0614843]], + [[-0.5048497, 1.4143617], + [-0.47332114, 0.6017133], + [-0.30974793, 1.1995442], + [-0.5048497, + 1.4143617]]]]).float() + expected_points_grad = torch.tensor( + [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0.15585709, 0.15585709, 0.15585709, 0.15585709], + [1.1893613, 1.1893613, 1.1893613, 1.1893613]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + 
[[1.6530733, 1.6530733, 1.6530733, 1.6530733], + [1.8130021, 1.8130021, 1.8130021, 1.8130021]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0.58863074, 0.58863074, 0.58863074, 0.58863074], + [1.3727596, 1.3727596, 1.3727596, 1.3727596]], + [[0.28462553, 0.28462553, 0.28462553, 0.28462553], + [0.8378516, 0.8378516, 0.8378516, 0.8378516]]], + [[[0.13817799, 0.13817799, 0.13817799, 0.13817799], + [0.34856772, 0.34856772, 0.34856772, 0.34856772]], + [[0.7405102, 0.7405102, 0.7405102, 0.7405102], + [0.06438422, 0.06438422, 0.06438422, 0.06438422]], + [[0.8491963, 0.8491963, 0.8491963, 0.8491963], + [1.1301711, 1.1301711, 1.1301711, 1.1301711]], + [[0.6887394, 0.6887394, 0.6887394, 0.6887394], + [0.22089851, 0.22089851, 0.22089851, 0.22089851]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0.605832, 0.605832, 0.605832, 0.605832], + [0.92364264, 0.92364264, 0.92364264, 0.92364264]], + [[0.23089725, 0.23089725, 0.23089725, 0.23089725], + [0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float() + expected_centers_grad = torch.tensor( + [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[-1.0493311, -1.0493311, -1.0493311, -1.0493311], + [-2.0301602, -2.0301602, -2.0301602, -2.0301602]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[-1.6328557, -1.6328557, -1.6328557, -1.6328557], + [-3.1828144, -3.1828144, -3.1828144, -3.1828144]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]]], + [[[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[-1.5429721, -1.5429721, -1.5429721, -1.5429721], + [-1.6100934, -1.6100934, -1.6100934, -1.6100934]], + [[-1.7103812, -1.7103812, -1.7103812, -1.7103812], + [-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float() + 
assert torch.allclose( + scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6) + assert torch.allclose( + points.grad.detach().cpu(), expected_points_grad, atol=1e-6) + assert torch.allclose( + centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6) From 42d93d48a8356375e80cc7749409f7a2c962d1eb Mon Sep 17 00:00:00 2001 From: Jing-Yi Li Date: Mon, 4 Nov 2024 11:57:52 +0800 Subject: [PATCH 20/33] update point_to_voxel & voxel_to_point in scatter_points.py --- mmcv/ops/scatter_points.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mmcv/ops/scatter_points.py b/mmcv/ops/scatter_points.py index e0aec67ce6..46d9b3b83e 100644 --- a/mmcv/ops/scatter_points.py +++ b/mmcv/ops/scatter_points.py @@ -39,11 +39,12 @@ def forward(ctx: Any, ctx.device = feats.device.type if ctx.device == 'npu': import ads_c - voxel_idx = ads_c.point_to_voxel(coors, [], []) + voxel_idx = ads_c.point_to_voxel(coors, [], [], 'XYZ') unique_res = ads_c.unique_voxel(voxel_idx) num_voxels, uniqued_voxel_idx, prefix_sum, \ argsort_coor, _ = unique_res - voxel_coors = ads_c.voxel_to_point(uniqued_voxel_idx, [], []) + voxel_coors = \ + ads_c.voxel_to_point(uniqued_voxel_idx, [], [], 'XYZ') voxel_feats, \ compare_mask = ads_c.npu_dynamic_scatter(feats, coors, prefix_sum, From 0535385343ff7074bb7f964781d6309536a881da Mon Sep 17 00:00:00 2001 From: frh23333 <13806911906@163.com> Date: Mon, 4 Nov 2024 21:06:54 +0800 Subject: [PATCH 21/33] border_align --- .../ops/csrc/pytorch/npu/border_align_npu.cpp | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp b/mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp new file mode 100644 index 0000000000..b1d0004e00 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp @@ -0,0 +1,53 @@ +#include "pytorch_npu_helper.hpp" + +using namespace NPU_NAME_SPACE; +using namespace std; + +void 
border_align_forward_impl(const Tensor &input, const Tensor &boxes, Tensor output,
+                               Tensor argmax_idx, const int pool_size);
+
+// NPU forward launcher for border_align.
+// Runs aclnnBorderAlign on the {0, 2, 3, 1}-permuted feature map into a
+// zero-initialised buffer of shape
+// {batch_size, height * width, pool_size + 1, channels}, then takes the max
+// over dim -2 of that buffer. Values (as float) and indices (as int) are
+// reshaped to {batch_size, height * width, 4, channels / 4}, permuted with
+// {0, 3, 1, 2} and copied into `output` and `argmax_idx`.
+void border_align_forward_npu(const Tensor &input, const Tensor &boxes, Tensor output,
+                              Tensor argmax_idx, const int pool_size){
+    // size(2) and size(3) are read below, so reject other ranks up front.
+    TORCH_CHECK(input.dim() == 4, "input.dim() must be 4, but got: ", input.dim());
+    TORCH_CHECK(input.size(0) == boxes.size(0), "The batch sizes of feature map and rois must be the same.");
+    TORCH_CHECK(input.size(1) % 4 == 0, "The number of channels must be divisible by 4.");
+    TORCH_CHECK(pool_size >= 2, "The pool size should be at least 2.");
+    int32_t batch_size = input.size(0);
+    int32_t channels = input.size(1);
+    int32_t height = input.size(2);
+    int32_t width = input.size(3);
+    at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous();
+    at::Tensor rois_map = boxes.contiguous();
+    at::Tensor temp_tensor = at::zeros({batch_size, height * width, pool_size + 1, channels}, input.options());
+    EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pool_size, temp_tensor);
+    // Reduce over the (pool_size + 1) axis; keep both the values and indices.
+    auto max_result = temp_tensor.max(-2);
+    at::Tensor output_ = std::get<0>(max_result).to(at::kFloat);
+    output_ = output_.reshape({batch_size, height * width, 4, channels / 4}).permute({0, 3, 1, 2}).contiguous();
+    output.copy_(output_);
+    at::Tensor argmax_idx_ = std::get<1>(max_result).to(at::kInt);
+    argmax_idx_ = argmax_idx_.reshape({batch_size, height * width, 4, channels / 4}).permute({0, 3, 1, 2}).contiguous();
+    argmax_idx.copy_(argmax_idx_);
+}
+REGISTER_NPU_IMPL(border_align_forward_impl, border_align_forward_npu);
+
+
+void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
+                                const Tensor &argmax_idx, Tensor grad_input,
+                                const int pool_size);
+
+// NPU backward launcher for border_align.
+// Validates tensor ranks, reads the scalar sizes the op takes from
+// `grad_output`, `boxes` and `grad_input`, and runs aclnnBorderAlignGrad,
+// which writes into `grad_input`.
+void border_align_backward_npu(const Tensor &grad_output, const Tensor &boxes,
+                               const Tensor &argmax_idx, Tensor grad_input,
+                               const int pool_size){
+    // Each message names the tensor that is actually being checked.
+    TORCH_CHECK(grad_output.dim() == 4, "grad_output.dim() must be 4, but got: ", grad_output.dim());
+    TORCH_CHECK(boxes.dim() == 3, "boxes.dim() must be 3, but got: ", boxes.dim());
+    TORCH_CHECK(argmax_idx.dim() == 4, "argmax_idx.dim() must be 4, but got: ", argmax_idx.dim());
+
+    int32_t batch_size = grad_output.size(0);
+    int32_t channels = grad_output.size(1);
+    int32_t box_size = boxes.size(1);
+    int32_t height = grad_input.size(2);
+    int32_t width = grad_input.size(3);
+
+    EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_output, boxes, argmax_idx, channels, box_size, height, width, pool_size, batch_size, grad_input);
+}
+REGISTER_NPU_IMPL(border_align_backward_impl, border_align_backward_npu);
From 6c4840200f72c09f190a40d78b49e5e6e82ec547 Mon Sep 17 00:00:00 2001 From: Zac <33156501+ZrBac@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:29:05 +0800 Subject: [PATCH 22/33] Update points_in_boxes.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 去除判断 --- mmcv/ops/points_in_boxes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mmcv/ops/points_in_boxes.py b/mmcv/ops/points_in_boxes.py index 50a085cf46..3069867886 100644 --- a/mmcv/ops/points_in_boxes.py +++ b/mmcv/ops/points_in_boxes.py @@ -133,8 +133,6 @@ def points_in_boxes_all(points: Tensor, boxes: Tensor) -> Tensor: if points.device.type != 'npu': if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) - elif points.device.type == 'npu': - boxes[:, :, 2] += boxes[:, :, 5] / 2.0 ext_module.points_in_boxes_all_forward(boxes.contiguous(), points.contiguous(), box_idxs_of_pts) From 3424ec1daf6e004f3b9ef8e338fdf701f5eafadc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B4=BA=E9=9B=A8=E6=9D=B0?= Date: Wed, 6 Nov 2024 11:15:37 +0800 Subject: [PATCH 23/33] add new npu op roiaware_pool3d --- mmcv/ops/csrc/pytorch/npu/roiaware_pool3d.cpp | 86 +++++++++++++++++++ mmcv/ops/scatter_points.py | 10 +-- 2 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 mmcv/ops/csrc/pytorch/npu/roiaware_pool3d.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/roiaware_pool3d.cpp
b/mmcv/ops/csrc/pytorch/npu/roiaware_pool3d.cpp new file mode 100644 index 0000000000..50706df867 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/npu/roiaware_pool3d.cpp @@ -0,0 +1,86 @@
+#include "pytorch_npu_helper.hpp"
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+// NPU forward launcher for roiaware_pool3d.
+// Runs aclnnRoiawarePool3d, which writes `argmax`, `pts_idx_of_voxels` and the
+// pooled features. When `rois` is fp16, `rois`, `pts`, `pts_feature` and the
+// pooled-feature buffer are converted to fp32 for the op and the result is
+// converted back before being copied into `pooled_features`.
+void roiaware_pool3d_forward_npu(int boxes_num, int pts_num, int channels,
+                                 int max_pts_each_voxel, int out_x, int out_y,
+                                 int out_z, const Tensor rois, const Tensor pts,
+                                 const Tensor pts_feature, Tensor argmax,
+                                 Tensor pts_idx_of_voxels,
+                                 Tensor pooled_features, int pool_method) {
+  at::Tensor rois_cast = rois;
+  at::Tensor pts_cast = pts;
+  at::Tensor pts_feature_cast = pts_feature;
+  at::Tensor pooled_features_cast = pooled_features;
+
+  auto dtype = rois.dtype();
+  if (dtype == at::kHalf) {
+    rois_cast = rois_cast.to(at::kFloat);
+    pts_cast = pts_cast.to(at::kFloat);
+    pts_feature_cast = pts_feature_cast.to(at::kFloat);
+    pooled_features_cast = pooled_features_cast.to(at::kFloat);
+  }
+
+  EXEC_NPU_CMD(aclnnRoiawarePool3d, rois_cast, pts_cast, pts_feature_cast,
+               pool_method, max_pts_each_voxel, out_x, out_y, out_z, argmax,
+               pts_idx_of_voxels, pooled_features_cast);
+
+  if (dtype == at::kHalf) {
+    pooled_features_cast = pooled_features_cast.to(at::kHalf);
+  }
+
+  pooled_features.copy_(pooled_features_cast);
+}
+
+// NPU backward launcher for roiaware_pool3d.
+// pool_method 0 runs aclnnRoiawareMaxpool3dGrad (driven by `argmax`) and
+// pool_method 1 runs aclnnRoiawareAvgpool3dGrad (driven by
+// `pts_idx_of_voxels`); any other value is rejected. When `grad_out` is fp16
+// the op runs on fp32 copies and the result is converted back before being
+// copied into `grad_in`.
+void roiaware_pool3d_backward_npu(int boxes_num, int out_x, int out_y,
+                                  int out_z, int channels,
+                                  int max_pts_each_voxel,
+                                  const Tensor pts_idx_of_voxels,
+                                  const Tensor argmax, const Tensor grad_out,
+                                  Tensor grad_in, int pool_method)
+{
+  // Previously an unknown pool_method fell through both branches and left
+  // grad_in untouched without any diagnostic.
+  TORCH_CHECK(pool_method == 0 || pool_method == 1,
+              "pool_method must be 0 (max) or 1 (avg), but got: ", pool_method);
+
+  int32_t npoints = static_cast<int32_t>(grad_in.size(0));
+
+  auto dtype = grad_out.dtype();
+  at::Tensor grad_out_cast = grad_out;
+  at::Tensor grad_in_cast = grad_in;
+
+  if (dtype == at::kHalf) {
+    grad_out_cast = grad_out.to(at::kFloat);
+    grad_in_cast = grad_in_cast.to(at::kFloat);
+  }
+
+  if (pool_method == 0) {
+    // maxpool3d
+    EXEC_NPU_CMD(aclnnRoiawareMaxpool3dGrad, argmax, grad_out_cast, boxes_num,
+                 out_x, out_y, out_z, channels, npoints, grad_in_cast);
+  } else {
+    // avgpool3d (pool_method == 1, guaranteed by the check above)
+    EXEC_NPU_CMD(aclnnRoiawareAvgpool3dGrad, pts_idx_of_voxels, grad_out_cast,
+                 boxes_num, out_x, out_y, out_z, channels, npoints,
+                 max_pts_each_voxel, grad_in_cast);
+  }
+
+  if (dtype == at::kHalf) {
+    grad_in_cast = grad_in_cast.to(at::kHalf);
+  }
+
+  grad_in.copy_(grad_in_cast);
+}
+
+// Declarations of the entry points the NPU launchers are registered under.
+void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
+                                  int max_pts_each_voxel, int out_x, int out_y,
+                                  int out_z, const Tensor rois,
+                                  const Tensor pts, const Tensor pts_feature,
+                                  Tensor argmax, Tensor pts_idx_of_voxels,
+                                  Tensor pooled_features, int pool_method);
+
+void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
+                                   int out_z, int channels,
+                                   int max_pts_each_voxel,
+                                   const Tensor pts_idx_of_voxels,
+                                   const Tensor argmax, const Tensor grad_out,
+                                   Tensor grad_in, int pool_method);
+
+REGISTER_NPU_IMPL(roiaware_pool3d_forward_impl, roiaware_pool3d_forward_npu);
+REGISTER_NPU_IMPL(roiaware_pool3d_backward_impl, roiaware_pool3d_backward_npu);
\ No newline at end of file diff --git a/mmcv/ops/scatter_points.py b/mmcv/ops/scatter_points.py index 68bd28319f..d69a87b358 100644 --- a/mmcv/ops/scatter_points.py +++ b/mmcv/ops/scatter_points.py @@ -38,15 +38,15 @@ def forward(ctx: Any, """ ctx.device = feats.device.type if ctx.device == 'npu': - import ads_c - voxel_idx = ads_c.point_to_voxel(coors, [], [], 'XYZ') - unique_res = ads_c.unique_voxel(voxel_idx) + import mx_driving._C + voxel_idx = mx_driving._C.point_to_voxel(coors, [], [], 'XYZ') + unique_res = mx_driving._C.unique_voxel(voxel_idx) num_voxels, uniqued_voxel_idx, prefix_sum, \ argsort_coor, _ = unique_res voxel_coors = \ - ads_c.voxel_to_point(uniqued_voxel_idx, [], [], 'XYZ') + mx_driving._C.voxel_to_point(uniqued_voxel_idx, [], [], 'XYZ') voxel_feats, \ - compare_mask = ads_c.npu_dynamic_scatter(feats, coors, + compare_mask = mx_driving._C.npu_dynamic_scatter(feats, coors, prefix_sum, argsort_coor, num_voxels, From
4a213871b092ef35b223069c1a817eca7e1f8828 Mon Sep 17 00:00:00 2001 From: ZYF-Annarine Date: Wed, 6 Nov 2024 18:39:41 +0800 Subject: [PATCH 24/33] pixel_grou --- mmcv/ops/pixel_group.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mmcv/ops/pixel_group.py b/mmcv/ops/pixel_group.py index cf73e326da..e1ab1e8693 100644 --- a/mmcv/ops/pixel_group.py +++ b/mmcv/ops/pixel_group.py @@ -10,6 +10,36 @@ ext_module = ext_loader.load_ext('_ext', ['pixel_group']) +def estimate_confidence(label: torch.Tensor, score: torch.Tensor, + label_num: int) -> List[List[float]]: + + import torch_npu + point_vector = torch.zeros((label_num, 2), + dtype=torch.float32).to(score.device) + + label_flat = label.flatten() + score_flat = score.flatten() + + mask = label_flat > 0 + valid_labels = label_flat[mask] + valid_scores = score_flat[mask] + + point_vector.index_add_( + 0, valid_labels, + torch.stack((valid_scores, torch.ones_like(valid_scores)), dim=1)) + + valid_mask = point_vector[:, 1] > 0 + point_vector[valid_mask, 0] /= point_vector[valid_mask, 1] + + point_vector_list = point_vector.tolist() + for l in range(1, label_num): + coords = (label == l).nonzero(as_tuple=False).float() + coords = coords[:, [1, 0]] + point_vector_list[l].extend(coords.flatten().tolist()) + + return point_vector_list + + def pixel_group( score: Union[np.ndarray, Tensor], mask: Union[np.ndarray, Tensor], @@ -59,6 +89,30 @@ def pixel_group( if isinstance(kernel_contour, np.ndarray): kernel_contour = torch.from_numpy(kernel_contour) + if score.device.type == 'npu': + import torch_npu + embedding_dim = embedding.shape[2] + kernel_vector = torch.zeros((kernel_region_num, embedding_dim), + dtype=torch.float32).to(score.device) + + for label in range(1, kernel_region_num): + label_mask = (kernel_label == label) + label_embeddings = embedding[label_mask] + kernel_vector[label, :] = label_embeddings.sum(dim=0) + vector_sum = label_mask.sum() + kernel_vector[label, :] /= 
vector_sum + + kernel_cv = kernel_vector[label, :] + valid_mask = (mask == 1) & (kernel_label == 0) + valid_embeddings = embedding[valid_mask] + distances = torch.sum((valid_embeddings - kernel_cv)**2, dim=1) + within_threshold = distances < distance_threshold**2 + + kernel_label[valid_mask] = torch.where(within_threshold, label, + kernel_label[valid_mask]) + + return estimate_confidence(kernel_label, score, kernel_region_num) + if torch.__version__ == 'parrots': label = ext_module.pixel_group( score, From eedad4938b491ca3741912b3b4d9e6a8b2fe6462 Mon Sep 17 00:00:00 2001 From: JYYCaN <32977726+JYYCaN@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:56:26 +0800 Subject: [PATCH 25/33] scatter points bug fix --- mmcv/ops/scatter_points.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmcv/ops/scatter_points.py b/mmcv/ops/scatter_points.py index d69a87b358..3061343d2a 100644 --- a/mmcv/ops/scatter_points.py +++ b/mmcv/ops/scatter_points.py @@ -71,13 +71,13 @@ def backward(ctx: Any, grad_voxel_feats: torch.Tensor, grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: if ctx.device == 'npu': - import ads_c + import mx_driving._C prefix_sum, argsort_coor, compare_mask = ctx.saved_tensors grad_point_feats = torch.zeros( ctx.feats_shape, dtype=grad_voxel_feats.dtype, device=grad_voxel_feats.device) - ads_c.npu_dynamic_scatter_grad(grad_point_feats, + mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum, argsort_coor, compare_mask, ctx.reduce_type) From 7fd21ee94bf225392a30fc7ada389b615e78fc77 Mon Sep 17 00:00:00 2001 From: abdu-uy Date: Sat, 23 Nov 2024 17:21:54 +0800 Subject: [PATCH 26/33] update nms_rotated from openmmlab.mmcv main --- mmcv/ops/nms.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/mmcv/ops/nms.py b/mmcv/ops/nms.py index 2f1d27c4d2..946d016a70 100644 --- a/mmcv/ops/nms.py +++ b/mmcv/ops/nms.py @@ -412,7 +412,7 @@ def nms_rotated(dets: 
Tensor, else: input_labels = labels - if dets.device.type == 'mlu': + if dets.device.type in ('npu', 'mlu'): order = scores.new_empty(0, dtype=torch.long) keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw, input_labels, iou_threshold, @@ -421,25 +421,6 @@ def nms_rotated(dets: Tensor, dim=1) return dets, keep_inds - if dets.device.type == 'npu': - order = scores.new_empty(0, dtype=torch.long) - coefficient = 57.29578 # 180 / PI - if dets.dtype == torch.float16: - dets_cw = dets_cw.float() - else: - dets_cw = dets_cw.clone() - for i in range(dets.size()[0]): - dets_cw[i][4] *= coefficient # radians to angle - scores = scores.float() - keep_inds = ext_module.nms_rotated(dets_cw, scores, order, dets_cw, - input_labels, iou_threshold, - multi_label) - if dets.dtype == torch.float16: - scores = scores.half() - dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)), - dim=1) - return dets, keep_inds - if multi_label: dets_wl = torch.cat((dets_cw, labels.unsqueeze(1)), 1) # type: ignore else: From ca26c89263c0a5949185778b2ea25a1cc7c56ef8 Mon Sep 17 00:00:00 2001 From: qiucan4 Date: Tue, 26 Nov 2024 20:26:43 +0800 Subject: [PATCH 27/33] roi_align_rotated_v2 --- .../pytorch/npu/roi_align_rotated_v2_npu.cpp | 31 ++++++---- mmcv/ops/csrc/pytorch/pybind.cpp | 12 ++-- .../ops/csrc/pytorch/roi_align_rotated_v2.cpp | 16 ++--- mmcv/ops/roi_align_rotated_v2.py | 60 ++++++++++--------- 4 files changed, 66 insertions(+), 53 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp b/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp index b2ea93b261..64248ada45 100644 --- a/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/roi_align_rotated_v2_npu.cpp @@ -3,25 +3,36 @@ using namespace NPU_NAME_SPACE; using namespace std; -void roi_align_rotated_v2_forward_npu(const Tensor input, Tensor rois_map, - Tensor output, +void roi_align_rotated_v2_forward_npu(const Tensor x, Tensor rois_map, + 
Tensor y, + int32_t pooled_h, + int32_t pooled_w, double spatial_scale, int32_t sampling_ratio, - int32_t pooled_height, - int32_t pooled_width, bool aligned, bool clockwise) { - at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); + at::Tensor feature_map = x.permute({0, 2, 3, 1}).contiguous(); at::Tensor rois = rois_map.permute({1, 0}).contiguous(); - EXEC_NPU_CMD(aclnnRoiAlignRotatedV2, feature_map, rois, spatial_scale, sampling_ratio, pooled_height, pooled_width, aligned, clockwise, output); + at_npu::native::OpCommand cmd; + cmd.Name("RoiAlignRotated") + .Input(feature_map) + .Input(rois) + .Output(y) + .Attr("pooled_h", static_cast(pooled_h)) + .Attr("pooled_w", static_cast(pooled_w)) + .Attr("spatial_scale", static_cast(spatial_scale)) + .Attr("sampling_ratio", static_cast(sampling_ratio)) + .Attr("aligned", aligned) + .Attr("clockwise", clockwise) + .Run(); } -void roi_align_rotated_v2_forward_impl(const Tensor input, Tensor rois, - Tensor output, +void roi_align_rotated_v2_forward_impl(const Tensor x, Tensor rois, + Tensor y, + int32_t pooled_h, + int32_t pooled_w, double spatial_scale, int32_t sampling_ratio, - int32_t pooled_height, - int32_t pooled_width, bool aligned, bool clockwise); diff --git a/mmcv/ops/csrc/pytorch/pybind.cpp b/mmcv/ops/csrc/pytorch/pybind.cpp index 3745bd8978..43e9b270f2 100644 --- a/mmcv/ops/csrc/pytorch/pybind.cpp +++ b/mmcv/ops/csrc/pytorch/pybind.cpp @@ -209,8 +209,8 @@ void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y, int sampling_ratio, int pool_mode, bool aligned); void roi_align_rotated_v2_forward(Tensor input, Tensor rois, Tensor output, + int pooled_h, int pooled_w, double spatial_scale, int sampling_ratio, - int aligned_height, int aligned_width, bool aligned, bool clockwise); void roi_align_rotated_v2_backward(Tensor input, Tensor rois, @@ -343,9 +343,9 @@ void roi_align_rotated_backward(Tensor grad_output, Tensor rois, bool clockwise); void roi_align_rotated_v2_forward(Tensor 
input, Tensor rois, Tensor output, - double spatial_scale, int sampling_ratio, - int aligned_height, int aligned_width, - bool aligned, bool clockwise); + int pooled_h, int pooled_w, + double spatial_scale, int sampling_ratio, + bool aligned, bool clockwise); void roi_align_rotated_v2_backward(Tensor input, Tensor rois, Tensor grad_output, Tensor grad_input, int pooled_height, int pooled_width, double spatial_scale, @@ -814,8 +814,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("sampling_ratio"), py::arg("aligned"), py::arg("clockwise")); m.def("roi_align_rotated_v2_forward", &roi_align_rotated_v2_forward, "roi_align_rotated_v2_forward", py::arg("input"), py::arg("rois"), - py::arg("output"), py::arg("spatial_scale"), py::arg("sampling_ratio"), - py::arg("pooled_height"), py::arg("pooled_width"), + py::arg("output"), py::arg("pooled_h"), py::arg("pooled_w"), + py::arg("spatial_scale"), py::arg("sampling_ratio"), py::arg("aligned"), py::arg("clockwise")); m.def("roi_align_rotated_v2_backward", &roi_align_rotated_v2_backward, "roi_align_rotated_v2_backward", py::arg("input"), py::arg("rois"), diff --git a/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp b/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp index 7743775288..ec2a17bf77 100644 --- a/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp +++ b/mmcv/ops/csrc/pytorch/roi_align_rotated_v2.cpp @@ -2,22 +2,22 @@ #include "pytorch_cpp_helper.hpp" #include "pytorch_device_registry.hpp" -void roi_align_rotated_v2_forward_impl(Tensor input, Tensor rois, Tensor output, +void roi_align_rotated_v2_forward_impl(Tensor x, Tensor rois, Tensor y, + int pooled_h, int pooled_w, double spatial_scale, int sampling_ratio, - int pooled_height, int pooled_width, bool aligned, bool clockwise) { - DISPATCH_DEVICE_IMPL(roi_align_rotated_v2_forward_impl, input, rois, output, - spatial_scale, sampling_ratio, pooled_height, pooled_width, + DISPATCH_DEVICE_IMPL(roi_align_rotated_v2_forward_impl, x, rois, y, + pooled_h, pooled_w, 
spatial_scale, sampling_ratio, aligned, clockwise); } -void roi_align_rotated_v2_forward(Tensor input, Tensor rois, Tensor output, +void roi_align_rotated_v2_forward(Tensor x, Tensor rois, Tensor y, + int pooled_h, int pooled_w, double spatial_scale, int sampling_ratio, - int pooled_height, int pooled_width, bool aligned, bool clockwise) { - roi_align_rotated_v2_forward_impl(input, rois, output, spatial_scale, sampling_ratio, - pooled_height, pooled_width, aligned, clockwise); + roi_align_rotated_v2_forward_impl(x, rois, y, pooled_h, pooled_w, + spatial_scale, sampling_ratio, aligned, clockwise); } diff --git a/mmcv/ops/roi_align_rotated_v2.py b/mmcv/ops/roi_align_rotated_v2.py index 639fea3a23..80c97ae736 100644 --- a/mmcv/ops/roi_align_rotated_v2.py +++ b/mmcv/ops/roi_align_rotated_v2.py @@ -14,55 +14,55 @@ class RoIAlignRotatedV2Function(Function): @staticmethod - def symbolic(g, input, rois, spatial_scale, sampling_ratio, pooled_height, - pooled_width, aligned, clockwise): + def symbolic(g, x, rois, spatial_scale, sampling_ratio, pooled_h, + pooled_w, aligned, clockwise): return g.op( 'mmcv::MMCVRoIAlignRotatedV2', - input, + x, rois, + pooled_h=pooled_h, + pooled_w=pooled_w, spatial_scale_f=spatial_scale, sampling_ratio_i=sampling_ratio, - pooled_height=pooled_height, - pooled_width=pooled_width, aligned_i=aligned, clockwise_i=clockwise) @staticmethod def forward(ctx: Any, - input: torch.Tensor, + x: torch.Tensor, rois: torch.Tensor, + pooled_h: int, + pooled_w: int, spatial_scale: float, sampling_ratio: int, - pooled_height: int, - pooled_width: int, aligned: bool = True, clockwise: bool = False) -> torch.Tensor: - ctx.pooled_height = pooled_height - ctx.pooled_width = pooled_width + ctx.pooled_h = pooled_h + ctx.pooled_w = pooled_w ctx.spatial_scale = spatial_scale ctx.sampling_ratio = sampling_ratio ctx.aligned = aligned ctx.clockwise = clockwise - ctx.save_for_backward(input, rois) - ctx.feature_size = input.size() - batch_size, num_channels, data_height, 
data_width = input.size() + ctx.save_for_backward(x, rois) + ctx.feature_size = x.size() + batch_size, num_channels, data_height, data_width = x.size() num_rois = rois.size(0) - output = input.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, + y = x.new_zeros(num_rois, ctx.pooled_h, ctx.pooled_w, num_channels) ext_module.roi_align_rotated_v2_forward( - input, + x, rois, - output, + y, + pooled_h=ctx.pooled_h, + pooled_w=ctx.pooled_w, spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, - pooled_height=ctx.pooled_height, - pooled_width=ctx.pooled_width, aligned=ctx.aligned, clockwise=ctx.clockwise) - output = output.transpose(2, 3).transpose(1, 2).contiguous() - return output + y = y.transpose(2, 3).transpose(1, 2).contiguous() + return y @staticmethod def backward(ctx: Any, grad_output: torch.Tensor): @@ -74,7 +74,7 @@ def backward(ctx: Any, grad_output: torch.Tensor): input.size(0), input.size(2), input.size(3), input.size(1)) ext_module.roi_align_rotated_v2_backward( input, rois_trans, grad_output_trans, grad_input, - ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, + ctx.pooled_h, ctx.pooled_w, ctx.spatial_scale, ctx.sampling_ratio, ctx.aligned, ctx.clockwise) grad_input = grad_input.permute(0, 3, 1, 2).contiguous() @@ -134,31 +134,33 @@ class RoIAlignRotatedV2(nn.Module): }, cls_name='RoIAlignRotatedV2') def __init__(self, + pooled_h: int, + pooled_w: int, spatial_scale: float, sampling_ratio: int, - pooled_height: int, - pooled_width: int, aligned: bool = True, clockwise: bool = False): super().__init__() - self.pooled_height = int(pooled_height) - self.pooled_width = int(pooled_width) + self.pooled_h = int(pooled_h) + self.pooled_w = int(pooled_w) self.spatial_scale = float(spatial_scale) self.sampling_ratio = int(sampling_ratio) self.aligned = aligned self.clockwise = clockwise def forward(self, input: torch.Tensor, rois: torch.Tensor) -> torch.Tensor: - return RoIAlignRotatedV2Function.apply(input, rois, self.spatial_scale, + 
return RoIAlignRotatedV2Function.apply(input, rois, + self.pooled_h, + self.pooled_w, + self.spatial_scale, self.sampling_ratio, - self.pooled_height, - self.pooled_width, self.aligned, + self.aligned, self.clockwise) def __repr__(self): s = self.__class__.__name__ - s += f'(pooled_height={self.pooled_height}, ' + s += f'(pooled_h={self.pooled_h}, ' s += f'spatial_scale={self.spatial_scale}, ' s += f'sampling_ratio={self.sampling_ratio}, ' s += f'aligned={self.aligned}, ' From cf237188d79af7ab542997700840d559a40e4db9 Mon Sep 17 00:00:00 2001 From: Bosco-lab <1070506085@qq.com> Date: Wed, 27 Nov 2024 16:01:28 +0800 Subject: [PATCH 28/33] add pixel_group_npu --- mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp | 55 ++++++++++++++++++ mmcv/ops/pixel_group.py | 56 +------------------ 2 files changed, 56 insertions(+), 55 deletions(-) create mode 100644 mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp b/mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp new file mode 100644 index 0000000000..eb937d50fa --- /dev/null +++ b/mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp @@ -0,0 +1,55 @@ +#include "pytorch_npu_helper.hpp" + +using namespace NPU_NAME_SPACE; +using namespace std; + +vector> pixel_group_npu(Tensor score, Tensor mask, Tensor embedding, + Tensor kernel_label, Tensor kernel_contour, + int kernel_region_num, float distance_threshold) { + TORCH_CHECK(score.dim() == 2, "score.dim() must be 2, but got: ", score.dim()); + TORCH_CHECK(mask.dim() == 2, "mask.dim() must be 2, but got: ", mask.dim()); + TORCH_CHECK(embedding.dim() == 3, "embedding.dim() must be 3, but got: ", embedding.dim()); + TORCH_CHECK(kernel_label.dim() == 2, "kernel_label.dim() must be 2, but got: ", kernel_label.dim()); + TORCH_CHECK(kernel_contour.dim() == 2, "kernel_contour.dim() must be 2, but got: ", kernel_contour.dim()); + + auto label_size = kernel_label.sizes(); + auto height = label_size[0]; + auto width = label_size[1]; + + c10::SmallVector 
point_vector_size = {kernel_region_num, 2}; + c10::SmallVector label_updated_size = {height, width}; + at::Tensor point_vector = at::zeros(point_vector_size, score.options()); + at::Tensor label_updated = at::empty(label_updated_size, kernel_label.options()); + + EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label, kernel_contour, + kernel_region_num, distance_threshold, point_vector, label_updated); + + std::vector> pixel_assignment(kernel_region_num); + at::Tensor point_vector_cpu = point_vector.to(at::kCPU); + at::Tensor label_updated_cpu = label_updated.to(at::kCPU); + + for (int32_t l = 0; l < kernel_region_num; l++) { + pixel_assignment[l].push_back(point_vector_cpu[l][0].item()); + pixel_assignment[l].push_back(point_vector_cpu[l][1].item()); + if (pixel_assignment[l][1] > 0) { + pixel_assignment[l][0] /= pixel_assignment[l][1]; + } + if (l > 0) { + at::Tensor valid_mask = (label_updated_cpu == l); + at::Tensor indices = at::nonzero(valid_mask); + for (int32_t i = 0; i < indices.size(0); i++) { + auto x = indices[i][0].item(); + auto y = indices[i][1].item(); + pixel_assignment[l].push_back(y); + pixel_assignment[l].push_back(x); + } + } + } + return pixel_assignment; +} + +vector> pixel_group_impl(Tensor score, Tensor mask, Tensor embedding, + Tensor kernel_label, Tensor kernel_contour, + int kernel_region_num, float distance_threshold); + +REGISTER_NPU_IMPL(pixel_group_impl, pixel_group_npu); \ No newline at end of file diff --git a/mmcv/ops/pixel_group.py b/mmcv/ops/pixel_group.py index e1ab1e8693..bf48684e6e 100644 --- a/mmcv/ops/pixel_group.py +++ b/mmcv/ops/pixel_group.py @@ -10,36 +10,6 @@ ext_module = ext_loader.load_ext('_ext', ['pixel_group']) -def estimate_confidence(label: torch.Tensor, score: torch.Tensor, - label_num: int) -> List[List[float]]: - - import torch_npu - point_vector = torch.zeros((label_num, 2), - dtype=torch.float32).to(score.device) - - label_flat = label.flatten() - score_flat = score.flatten() - - mask = 
label_flat > 0 - valid_labels = label_flat[mask] - valid_scores = score_flat[mask] - - point_vector.index_add_( - 0, valid_labels, - torch.stack((valid_scores, torch.ones_like(valid_scores)), dim=1)) - - valid_mask = point_vector[:, 1] > 0 - point_vector[valid_mask, 0] /= point_vector[valid_mask, 1] - - point_vector_list = point_vector.tolist() - for l in range(1, label_num): - coords = (label == l).nonzero(as_tuple=False).float() - coords = coords[:, [1, 0]] - point_vector_list[l].extend(coords.flatten().tolist()) - - return point_vector_list - - def pixel_group( score: Union[np.ndarray, Tensor], mask: Union[np.ndarray, Tensor], @@ -89,30 +59,6 @@ def pixel_group( if isinstance(kernel_contour, np.ndarray): kernel_contour = torch.from_numpy(kernel_contour) - if score.device.type == 'npu': - import torch_npu - embedding_dim = embedding.shape[2] - kernel_vector = torch.zeros((kernel_region_num, embedding_dim), - dtype=torch.float32).to(score.device) - - for label in range(1, kernel_region_num): - label_mask = (kernel_label == label) - label_embeddings = embedding[label_mask] - kernel_vector[label, :] = label_embeddings.sum(dim=0) - vector_sum = label_mask.sum() - kernel_vector[label, :] /= vector_sum - - kernel_cv = kernel_vector[label, :] - valid_mask = (mask == 1) & (kernel_label == 0) - valid_embeddings = embedding[valid_mask] - distances = torch.sum((valid_embeddings - kernel_cv)**2, dim=1) - within_threshold = distances < distance_threshold**2 - - kernel_label[valid_mask] = torch.where(within_threshold, label, - kernel_label[valid_mask]) - - return estimate_confidence(kernel_label, score, kernel_region_num) - if torch.__version__ == 'parrots': label = ext_module.pixel_group( score, @@ -137,4 +83,4 @@ def pixel_group( kernel_label, kernel_contour, kernel_region_num, distance_threshold) - return pixel_assignment + return pixel_assignment \ No newline at end of file From a752d175c0406849003852f2d9c3bab5eb4ab163 Mon Sep 17 00:00:00 2001 From: Jing-Yi Li Date: Mon, 
16 Dec 2024 08:24:13 +0800 Subject: [PATCH 29/33] modify internal calls of npu boxes_overlap_bev & box_iou_rotated --- .../csrc/pytorch/npu/box_iou_rotated_npu.cpp | 6 +----- .../pytorch/npu/boxes_overlap_bev_npu.cpp | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp index d8b0bbaa67..fd68af68d9 100644 --- a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp @@ -8,14 +8,10 @@ void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious, const int mode_flag, const bool aligned) { - TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)"); TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)"); - auto trans = false; - auto is_clockwise = false; - EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise, - aligned, mode_flag, ious); + EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious); return; } diff --git a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp index 6bc6273083..3b59f52e2f 100644 --- a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp @@ -10,16 +10,19 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a, void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a, const int num_b, const Tensor boxes_b, Tensor ans_overlap) { + TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)"); + TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)"); - TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)"); - TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)"); + auto format_flag = 3; + auto clockwise 
= true; + auto mode_flag = 0; + auto aligned = false; + auto margin = 1e-2; - auto trans = false; - auto is_clockwise = false; - auto aligned = false; - auto mode_flag = 2; - EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, trans, is_clockwise, aligned, mode_flag, ans_overlap); - return; + EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, format_flag, clockwise, + mode_flag, aligned, margin, ans_overlap); + return; } -REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, iou3d_boxes_overlap_bev_forward_npu); +REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, + iou3d_boxes_overlap_bev_forward_npu); From 0aa84d293f51c8e17bbc782aafe09c2e31b8e1e2 Mon Sep 17 00:00:00 2001 From: ZYF-Annarine Date: Tue, 7 Jan 2025 16:48:28 +0800 Subject: [PATCH 30/33] git checkout origin pixel_group --- mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp | 55 ------------------- mmcv/ops/pixel_group.py | 52 ++++++++++++++++++ 2 files changed, 52 insertions(+), 55 deletions(-) delete mode 100644 mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp b/mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp deleted file mode 100644 index eb937d50fa..0000000000 --- a/mmcv/ops/csrc/pytorch/npu/pixel_group_npu.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "pytorch_npu_helper.hpp" - -using namespace NPU_NAME_SPACE; -using namespace std; - -vector> pixel_group_npu(Tensor score, Tensor mask, Tensor embedding, - Tensor kernel_label, Tensor kernel_contour, - int kernel_region_num, float distance_threshold) { - TORCH_CHECK(score.dim() == 2, "score.dim() must be 2, but got: ", score.dim()); - TORCH_CHECK(mask.dim() == 2, "mask.dim() must be 2, but got: ", mask.dim()); - TORCH_CHECK(embedding.dim() == 3, "embedding.dim() must be 3, but got: ", embedding.dim()); - TORCH_CHECK(kernel_label.dim() == 2, "kernel_label.dim() must be 2, but got: ", kernel_label.dim()); - TORCH_CHECK(kernel_contour.dim() == 2, "kernel_contour.dim() must be 2, but got: ", 
kernel_contour.dim()); - - auto label_size = kernel_label.sizes(); - auto height = label_size[0]; - auto width = label_size[1]; - - c10::SmallVector point_vector_size = {kernel_region_num, 2}; - c10::SmallVector label_updated_size = {height, width}; - at::Tensor point_vector = at::zeros(point_vector_size, score.options()); - at::Tensor label_updated = at::empty(label_updated_size, kernel_label.options()); - - EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label, kernel_contour, - kernel_region_num, distance_threshold, point_vector, label_updated); - - std::vector> pixel_assignment(kernel_region_num); - at::Tensor point_vector_cpu = point_vector.to(at::kCPU); - at::Tensor label_updated_cpu = label_updated.to(at::kCPU); - - for (int32_t l = 0; l < kernel_region_num; l++) { - pixel_assignment[l].push_back(point_vector_cpu[l][0].item()); - pixel_assignment[l].push_back(point_vector_cpu[l][1].item()); - if (pixel_assignment[l][1] > 0) { - pixel_assignment[l][0] /= pixel_assignment[l][1]; - } - if (l > 0) { - at::Tensor valid_mask = (label_updated_cpu == l); - at::Tensor indices = at::nonzero(valid_mask); - for (int32_t i = 0; i < indices.size(0); i++) { - auto x = indices[i][0].item(); - auto y = indices[i][1].item(); - pixel_assignment[l].push_back(y); - pixel_assignment[l].push_back(x); - } - } - } - return pixel_assignment; -} - -vector> pixel_group_impl(Tensor score, Tensor mask, Tensor embedding, - Tensor kernel_label, Tensor kernel_contour, - int kernel_region_num, float distance_threshold); - -REGISTER_NPU_IMPL(pixel_group_impl, pixel_group_npu); \ No newline at end of file diff --git a/mmcv/ops/pixel_group.py b/mmcv/ops/pixel_group.py index bf48684e6e..c03f80a13f 100644 --- a/mmcv/ops/pixel_group.py +++ b/mmcv/ops/pixel_group.py @@ -9,6 +9,34 @@ ext_module = ext_loader.load_ext('_ext', ['pixel_group']) +def estimate_confidence(label: torch.Tensor, score: torch.Tensor, + label_num: int) -> List[List[float]]: + + import torch_npu + point_vector = 
torch.zeros((label_num, 2), + dtype=torch.float32).to(score.device) + + label_flat = label.flatten() + score_flat = score.flatten() + + mask = label_flat > 0 + valid_labels = label_flat[mask] + valid_scores = score_flat[mask] + + point_vector.index_add_( + 0, valid_labels, + torch.stack((valid_scores, torch.ones_like(valid_scores)), dim=1)) + + valid_mask = point_vector[:, 1] > 0 + point_vector[valid_mask, 0] /= point_vector[valid_mask, 1] + + point_vector_list = point_vector.tolist() + for l in range(1, label_num): + coords = (label == l).nonzero(as_tuple=False).float() + coords = coords[:, [1, 0]] + point_vector_list[l].extend(coords.flatten().tolist()) + + return point_vector_list def pixel_group( score: Union[np.ndarray, Tensor], @@ -59,6 +87,30 @@ def pixel_group( if isinstance(kernel_contour, np.ndarray): kernel_contour = torch.from_numpy(kernel_contour) + if score.device.type == 'npu': + import torch_npu + embedding_dim = embedding.shape[2] + kernel_vector = torch.zeros((kernel_region_num, embedding_dim), + dtype=torch.float32).to(score.device) + + for label in range(1, kernel_region_num): + label_mask = (kernel_label == label) + label_embeddings = embedding[label_mask] + kernel_vector[label, :] = label_embeddings.sum(dim=0) + vector_sum = label_mask.sum() + kernel_vector[label, :] /= vector_sum + + kernel_cv = kernel_vector[label, :] + valid_mask = (mask == 1) & (kernel_label == 0) + valid_embeddings = embedding[valid_mask] + distances = torch.sum((valid_embeddings - kernel_cv)**2, dim=1) + within_threshold = distances < distance_threshold**2 + + kernel_label[valid_mask] = torch.where(within_threshold, label, + kernel_label[valid_mask]) + + return estimate_confidence(kernel_label, score, kernel_region_num) + if torch.__version__ == 'parrots': label = ext_module.pixel_group( score, From 540ac07c923c36dce7bd091fe8887e2cb7276100 Mon Sep 17 00:00:00 2001 From: Hua-yuxiu Date: Tue, 7 Jan 2025 17:51:11 +0800 Subject: [PATCH 31/33] add impl of assign_score_withk 
backward --- .../pytorch/npu/assign_score_withk_npu.cpp | 21 +++ tests/test_ops/test_assign_score_withk.py | 147 +++++++++--------- 2 files changed, 94 insertions(+), 74 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp b/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp index 8945974a6b..8f5db077a0 100644 --- a/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp @@ -21,3 +21,24 @@ void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O, const Tensor& knn_idx, Tensor& output); REGISTER_NPU_IMPL(assign_score_withk_forward_impl, assign_score_withk_forward_npu); + + +void assign_score_withk_backward_npu( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores) { + + at::Tensor grad_out_trans = grad_out.permute({0, 2, 3, 1}); + + EXEC_NPU_CMD(aclnnAssignScoreWithkGrad, grad_out_trans, points, centers, scores, knn_idx, B, N0, N1, M, K, O, aggregate, grad_scores, grad_points, grad_centers); +} + +void assign_score_withk_backward_impl( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores); + +REGISTER_NPU_IMPL(assign_score_withk_backward_impl, assign_score_withk_backward_npu); + diff --git a/tests/test_ops/test_assign_score_withk.py b/tests/test_ops/test_assign_score_withk.py index bd27144545..d778121c74 100644 --- a/tests/test_ops/test_assign_score_withk.py +++ b/tests/test_ops/test_assign_score_withk.py @@ -126,77 +126,76 @@ def test_paconv_assign_scores(device): assert torch.allclose(output.detach().cpu(), expected_output, atol=1e-6) # test backward - if device == 'cuda': - loss = 
output.sum() - loss.backward() - expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683], - [-0.78873926, 0.7485497], - [-0.6866992, 0.05346543], - [0.04288036, -0.18217683]], - [[-1.1407862, 0.13533896], - [-0.06964391, -0.22948086], - [-1.1407862, 0.13533896], - [-0.06964391, -0.22948086]]], - [[[-0.3363995, -2.212181], - [-1.1589496, -2.7724311], - [-0.9387654, -1.3163853], - [-1.4385346, -1.0614843]], - [[-0.5048497, 1.4143617], - [-0.47332114, 0.6017133], - [-0.30974793, 1.1995442], - [-0.5048497, - 1.4143617]]]]).float() - expected_points_grad = torch.tensor( - [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0.15585709, 0.15585709, 0.15585709, 0.15585709], - [1.1893613, 1.1893613, 1.1893613, 1.1893613]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[1.6530733, 1.6530733, 1.6530733, 1.6530733], - [1.8130021, 1.8130021, 1.8130021, 1.8130021]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0.58863074, 0.58863074, 0.58863074, 0.58863074], - [1.3727596, 1.3727596, 1.3727596, 1.3727596]], - [[0.28462553, 0.28462553, 0.28462553, 0.28462553], - [0.8378516, 0.8378516, 0.8378516, 0.8378516]]], - [[[0.13817799, 0.13817799, 0.13817799, 0.13817799], - [0.34856772, 0.34856772, 0.34856772, 0.34856772]], - [[0.7405102, 0.7405102, 0.7405102, 0.7405102], - [0.06438422, 0.06438422, 0.06438422, 0.06438422]], - [[0.8491963, 0.8491963, 0.8491963, 0.8491963], - [1.1301711, 1.1301711, 1.1301711, 1.1301711]], - [[0.6887394, 0.6887394, 0.6887394, 0.6887394], - [0.22089851, 0.22089851, 0.22089851, 0.22089851]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0.605832, 0.605832, 0.605832, 0.605832], - [0.92364264, 0.92364264, 0.92364264, 0.92364264]], - [[0.23089725, 0.23089725, 0.23089725, 0.23089725], - [0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float() - expected_centers_grad = torch.tensor( - [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[-1.0493311, -1.0493311, 
-1.0493311, -1.0493311], - [-2.0301602, -2.0301602, -2.0301602, -2.0301602]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[-1.6328557, -1.6328557, -1.6328557, -1.6328557], - [-3.1828144, -3.1828144, -3.1828144, -3.1828144]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]]], - [[[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[0., 0., 0., 0.], [0., 0., 0., 0.]], - [[-1.5429721, -1.5429721, -1.5429721, -1.5429721], - [-1.6100934, -1.6100934, -1.6100934, -1.6100934]], - [[-1.7103812, -1.7103812, -1.7103812, -1.7103812], - [-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float() - assert torch.allclose( - scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6) - assert torch.allclose( - points.grad.detach().cpu(), expected_points_grad, atol=1e-6) - assert torch.allclose( - centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6) + loss = output.sum() + loss.backward() + expected_scores_grad = torch.tensor([[[[0.04288036, -0.18217683], + [-0.78873926, 0.7485497], + [-0.6866992, 0.05346543], + [0.04288036, -0.18217683]], + [[-1.1407862, 0.13533896], + [-0.06964391, -0.22948086], + [-1.1407862, 0.13533896], + [-0.06964391, -0.22948086]]], + [[[-0.3363995, -2.212181], + [-1.1589496, -2.7724311], + [-0.9387654, -1.3163853], + [-1.4385346, -1.0614843]], + [[-0.5048497, 1.4143617], + [-0.47332114, 0.6017133], + [-0.30974793, 1.1995442], + [-0.5048497, + 1.4143617]]]]).float() + expected_points_grad = torch.tensor( + [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0.15585709, 0.15585709, 0.15585709, 0.15585709], + [1.1893613, 1.1893613, 1.1893613, 1.1893613]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[1.6530733, 1.6530733, 1.6530733, 1.6530733], + [1.8130021, 1.8130021, 1.8130021, 1.8130021]], + [[0., 0., 
0., 0.], [0., 0., 0., 0.]], + [[0.58863074, 0.58863074, 0.58863074, 0.58863074], + [1.3727596, 1.3727596, 1.3727596, 1.3727596]], + [[0.28462553, 0.28462553, 0.28462553, 0.28462553], + [0.8378516, 0.8378516, 0.8378516, 0.8378516]]], + [[[0.13817799, 0.13817799, 0.13817799, 0.13817799], + [0.34856772, 0.34856772, 0.34856772, 0.34856772]], + [[0.7405102, 0.7405102, 0.7405102, 0.7405102], + [0.06438422, 0.06438422, 0.06438422, 0.06438422]], + [[0.8491963, 0.8491963, 0.8491963, 0.8491963], + [1.1301711, 1.1301711, 1.1301711, 1.1301711]], + [[0.6887394, 0.6887394, 0.6887394, 0.6887394], + [0.22089851, 0.22089851, 0.22089851, 0.22089851]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0.605832, 0.605832, 0.605832, 0.605832], + [0.92364264, 0.92364264, 0.92364264, 0.92364264]], + [[0.23089725, 0.23089725, 0.23089725, 0.23089725], + [0.5568468, 0.5568468, 0.5568468, 0.5568468]]]]).float() + expected_centers_grad = torch.tensor( + [[[[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[-1.0493311, -1.0493311, -1.0493311, -1.0493311], + [-2.0301602, -2.0301602, -2.0301602, -2.0301602]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[-1.6328557, -1.6328557, -1.6328557, -1.6328557], + [-3.1828144, -3.1828144, -3.1828144, -3.1828144]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]]], + [[[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[0., 0., 0., 0.], [0., 0., 0., 0.]], + [[-1.5429721, -1.5429721, -1.5429721, -1.5429721], + [-1.6100934, -1.6100934, -1.6100934, -1.6100934]], + [[-1.7103812, -1.7103812, -1.7103812, -1.7103812], + [-1.6344175, -1.6344175, -1.6344175, -1.6344175]]]]).float() + assert torch.allclose( + scores.grad.detach().cpu(), expected_scores_grad, atol=1e-6) + assert 
torch.allclose( + points.grad.detach().cpu(), expected_points_grad, atol=1e-6) + assert torch.allclose( + centers.grad.detach().cpu(), expected_centers_grad, atol=1e-6) From 4095faf7fa9e065c7017716e961af88319bf4c0b Mon Sep 17 00:00:00 2001 From: hust17yixuan <303660421@qq.com> Date: Wed, 8 Jan 2025 17:24:03 +0800 Subject: [PATCH 32/33] Revert "modify internal calls of npu boxes_overlap_bev & box_iou_rotated" This reverts commit a752d175c0406849003852f2d9c3bab5eb4ab163. --- .../csrc/pytorch/npu/box_iou_rotated_npu.cpp | 6 +++++- .../pytorch/npu/boxes_overlap_bev_npu.cpp | 21 ++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp index fd68af68d9..d8b0bbaa67 100644 --- a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp @@ -8,10 +8,14 @@ void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious, const int mode_flag, const bool aligned) { + TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)"); TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)"); - EXEC_NPU_CMD(aclnnBoxIou, boxes1, boxes2, mode_flag, aligned, ious); + auto trans = false; + auto is_clockwise = false; + EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise, + aligned, mode_flag, ious); return; } diff --git a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp index 3b59f52e2f..6bc6273083 100644 --- a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp +++ b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp @@ -10,19 +10,16 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a, void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a, const int num_b, const Tensor 
boxes_b, Tensor ans_overlap) { - TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)"); - TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)"); - auto format_flag = 3; - auto clockwise = true; - auto mode_flag = 0; - auto aligned = false; - auto margin = 1e-2; + TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)"); + TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)"); - EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, format_flag, clockwise, - mode_flag, aligned, margin, ans_overlap); - return; + auto trans = false; + auto is_clockwise = false; + auto aligned = false; + auto mode_flag = 2; + EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, trans, is_clockwise, aligned, mode_flag, ans_overlap); + return; } -REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, - iou3d_boxes_overlap_bev_forward_npu); +REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, iou3d_boxes_overlap_bev_forward_npu); From 0937678cf168362ad6b28ec48ad56acf641dbda8 Mon Sep 17 00:00:00 2001 From: yuansunshun Date: Mon, 13 Jan 2025 19:21:28 +0800 Subject: [PATCH 33/33] add diou_npu --- .../csrc/pytorch/npu/diff_iou_rotated_npu.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp diff --git a/mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp new file mode 100644 index 0000000000..7091a8ec3d --- /dev/null +++ b/mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp @@ -0,0 +1,28 @@ +#include "pytorch_npu_helper.hpp" +using namespace NPU_NAME_SPACE; +using namespace std; + +Tensor diff_iou_rotated_sort_vertices_npu(Tensor vertices, + Tensor mask, + Tensor num_valid) { + TORCH_CHECK(vertices.dim() == 4, "vertices must be a 4D Tensor, but got: ", vertices.dim()); + TORCH_CHECK(mask.dim() == 3, "mask must be a 3D Tensor, but got: ", mask.dim()); + TORCH_CHECK(num_valid.dim() == 2, "num_valid must 
be a 2D Tensor, but got: ", num_valid.dim()); + + uint32_t B = vertices.size(0); + uint32_t N = vertices.size(1); + + at::Tensor sortedIdx = at::empty({B, N, 9}, num_valid.options()); + at::Tensor mask_fp = mask.to(at::kFloat); + + EXEC_NPU_CMD(aclnnDiffIouRotatedSortVertices, vertices, mask_fp, num_valid, sortedIdx); + + return sortedIdx; +} + +Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, + Tensor mask, + Tensor num_valid); + +REGISTER_NPU_IMPL(diff_iou_rotated_sort_vertices_forward_impl, + diff_iou_rotated_sort_vertices_npu);