diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py
index b5fbcfdb..5441c1d1 100644
--- a/projects/BEVFusion/bevfusion/transforms_3d.py
+++ b/projects/BEVFusion/bevfusion/transforms_3d.py
@@ -24,7 +24,12 @@ def sample_augmentation(self, results):
H, W = results["ori_shape"]
fH, fW = self.final_dim
if self.is_train:
- resize = np.random.uniform(*self.resize_lim)
+ if isinstance(self.resize_lim, (int, float)):
+ aspect_ratio = min(fH / H, fW / W)
+ resize = np.random.uniform(aspect_ratio - self.resize_lim, aspect_ratio + self.resize_lim)
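+                # A scalar resize_lim is interpreted as a +/- jitter around the scale that
+                # fits the original image into final_dim. For example (hypothetical
+                # 1080x1920 input, final_dim (576, 864), resize_lim 0.02):
+                # min(576/1080, 864/1920) = 0.45, so resize is drawn from [0.43, 0.47].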
+ else:
+ resize = np.random.uniform(*self.resize_lim)
+
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.random.uniform(*self.bot_pct_lim)) * newH) - fH
@@ -35,7 +40,7 @@ def sample_augmentation(self, results):
flip = True
rotate = np.random.uniform(*self.rot_lim)
else:
- resize = np.mean(self.resize_lim)
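+            # Deterministic at test time: use the exact scale that fits the image into
+            # final_dim instead of the midpoint of a resize range.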
+ resize = min(fH / H, fW / W)
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.mean(self.bot_pct_lim)) * newH) - fH
@@ -52,7 +57,7 @@ def img_transform(self, img, rotation, translation, resize, resize_dims, crop, f
img = img.crop(crop)
if flip:
img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
- img = img.rotate(rotate)
+        img = img.rotate(rotate, resample=Image.BICUBIC)  # The default nearest-neighbour resampling introduces rotation artifacts.
# post-homography transformation
rotation *= resize
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_voxel_second_secfpn_2xb2_t4offline_no_intensity.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_offline_voxel_second_secfpn_4xb8_base.py
similarity index 80%
rename from projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_voxel_second_secfpn_2xb2_t4offline_no_intensity.py
rename to projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_offline_voxel_second_secfpn_4xb8_base.py
index 4813b5d8..130a4a89 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_voxel_second_secfpn_2xb2_t4offline_no_intensity.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_offline_voxel_second_secfpn_4xb8_base.py
@@ -8,9 +8,10 @@
# user setting
data_root = "data/t4dataset/"
-info_directory_path = "info/user_name/"
-train_gpu_size = 2
+info_directory_path = "info/username/"
+train_gpu_size = 4
train_batch_size = 2
+test_batch_size = 2
val_interval = 5
max_epochs = 30
backend_args = None
@@ -19,43 +20,45 @@
point_cloud_range = [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0]
voxel_size = [0.075, 0.075, 0.2]
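+# grid_size = range extent / voxel_size: 244.8 / 0.075 = 3264 in x and y;
+# z uses 8.0 / 0.2 + 1 = 41 slices (the extra slice follows the usual SECOND sparse-encoder convention).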
grid_size = [3264, 3264, 41]
+
eval_class_range = {
- "car": 121,
- "truck": 121,
- "bus": 121,
- "bicycle": 121,
- "pedestrian": 121,
+ "car": 120,
+ "truck": 120,
+ "bus": 120,
+ "bicycle": 120,
+ "pedestrian": 120,
}
# model parameter
input_modality = dict(use_lidar=True, use_camera=True)
point_load_dim = 5 # x, y, z, intensity, ring_id
-point_use_dim = 5
-point_intensity_dim = 3
+sweeps_num = 1
max_num_points = 10
max_voxels = [120000, 160000]
num_proposals = 500
-image_size = [256, 704]
-lidar_sweep_dims = [0, 1, 2, 4]
-num_workers = 1
-sweeps_num = 1
+image_size = [576, 864] # height, width
+num_workers = 32
+lidar_sweep_dims = [0, 1, 2, 4] # x, y, z, time_lag
+lidar_feature_dims = 4
model = dict(
type="BEVFusion",
data_preprocessor=dict(
+ type="Det3DDataPreprocessor",
+ pad_size_divisor=32,
voxelize_cfg=dict(
max_num_points=max_num_points,
- point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
max_voxels=max_voxels,
+ deterministic=True,
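+            # voxelize_reduce averages the point features within each voxel (hard-VFE style reduction).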
+ voxelize_reduce=True,
),
- type="Det3DDataPreprocessor",
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=False,
),
- pts_voxel_encoder=dict(type="HardSimpleVFE", num_features=4),
- pts_middle_encoder=dict(in_channels=4, sparse_shape=grid_size),
+ pts_middle_encoder=dict(sparse_shape=grid_size, in_channels=lidar_feature_dims),
img_backbone=dict(
type="mmdet.SwinTransformer",
embed_dims=96,
@@ -92,16 +95,11 @@
in_channels=256,
out_channels=80,
image_size=image_size,
- feature_size=[32, 88],
- # xbound=[-54.0, 54.0, 0.3],
- # ybound=[-54.0, 54.0, 0.3],
- # xbound=[-122.4, 122.4, 0.68],
- # ybound=[-122.4, 122.4, 0.68],
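+            # Image feature map fed to the view transform is image_size / 8
+            # (Swin + FPN output stride): 576 / 8 = 72, 864 / 8 = 108.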
+ feature_size=[72, 108],
xbound=[-122.4, 122.4, 0.3],
ybound=[-122.4, 122.4, 0.3],
zbound=[-10.0, 10.0, 20.0],
- # dbound=[1.0, 60.0, 0.5],
- dbound=[1.0, 166.2, 1.4],
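+            # dbound spans (134 - 1) / 1.4 = 95 discrete depth bins for the LSS depth distribution.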
+ dbound=[1.0, 134, 1.4],
downsample=2,
),
fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256),
@@ -115,6 +113,7 @@
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
),
test_cfg=dict(
+ dataset="t4datasets",
grid_size=grid_size,
voxel_size=voxel_size[0:2],
pc_range=point_cloud_range[0:2],
@@ -124,49 +123,13 @@
voxel_size=voxel_size[0:2],
),
),
+ # Lidar pipeline
+ pts_voxel_encoder=dict(num_features=lidar_feature_dims),
)
-# TODO: support object sample
-# db_sampler = dict(
-# data_root=data_root,
-# info_path=data_root +'nuscenes_dbinfos_train.pkl',
-# rate=1.0,
-# prepare=dict(
-# filter_by_difficulty=[-1],
-# filter_by_min_points=dict(
-# car=5,
-# truck=5,
-# bus=5,
-# trailer=5,
-# construction_vehicle=5,
-# traffic_cone=5,
-# barrier=5,
-# motorcycle=5,
-# bicycle=5,
-# pedestrian=5)),
-# classes=class_names,
-# sample_groups=dict(
-# car=2,
-# truck=3,
-# construction_vehicle=7,
-# bus=4,
-# trailer=6,
-# barrier=2,
-# motorcycle=6,
-# bicycle=6,
-# pedestrian=2,
-# traffic_cone=2),
-# points_loader=dict(
-# type='LoadPointsFromFile',
-# coord_type='LIDAR',
-# load_dim=5,
-# use_dim=[0, 1, 2, 3, 4],
-# backend_args=backend_args))
-
train_pipeline = [
dict(
type="BEVLoadMultiViewImageFromFiles",
- data_root=data_root,
to_float32=True,
color_type="color",
backend_args=backend_args,
@@ -175,39 +138,31 @@
type="LoadPointsFromFile",
coord_type="LIDAR",
load_dim=point_load_dim,
- use_dim=point_use_dim,
+ use_dim=point_load_dim,
backend_args=backend_args,
),
- # TODO: add feature
- # dict(
- # type="IntensityNorm",
- # alpha=10.0,
- # intensity_dim=point_intensity_dim,
- # div_factor=255.0,
- # ),
dict(
type="LoadPointsFromMultiSweeps",
sweeps_num=sweeps_num,
- load_dim=5,
+ load_dim=point_load_dim,
use_dim=lidar_sweep_dims,
pad_empty_sweeps=True,
remove_close=True,
backend_args=backend_args,
+ test_mode=False,
),
dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
- # TODO: support object sample
- # dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type="ImageAug3D",
final_dim=image_size,
- resize_lim=[0.38, 0.55],
+ resize_lim=0.02,
bot_pct_lim=[0.0, 0.0],
rot_lim=[-5.4, 5.4],
rand_flip=True,
is_train=True,
),
dict(
- type="GlobalRotScaleTrans",
+ type="BEVFusionGlobalRotScaleTrans",
rot_range=[-1.571, 1.571],
scale_ratio_range=[0.8, 1.2],
translation_std=[1.0, 1.0, 0.2],
@@ -259,23 +214,21 @@
test_pipeline = [
dict(
type="BEVLoadMultiViewImageFromFiles",
- data_root=data_root,
to_float32=True,
color_type="color",
backend_args=backend_args,
- test_mode=True,
),
dict(
type="LoadPointsFromFile",
coord_type="LIDAR",
- load_dim=5,
- use_dim=5,
+ load_dim=point_load_dim,
+ use_dim=point_load_dim,
backend_args=backend_args,
),
dict(
type="LoadPointsFromMultiSweeps",
sweeps_num=sweeps_num,
- load_dim=5,
+ load_dim=point_load_dim,
use_dim=lidar_sweep_dims,
pad_empty_sweeps=True,
remove_close=True,
@@ -285,7 +238,7 @@
dict(
type="ImageAug3D",
final_dim=image_size,
- resize_lim=[0.48, 0.48],
+ resize_lim=0.02,
bot_pct_lim=[0.0, 0.0],
rot_lim=[0.0, 0.0],
rand_flip=False,
@@ -313,6 +266,8 @@
),
]
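+# Drop frames that are missing one or more camera images.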
+filter_cfg = dict(filter_frames_with_missing_image=True)
+
train_dataloader = dict(
batch_size=train_batch_size,
num_workers=num_workers,
@@ -320,20 +275,22 @@
sampler=dict(type="DefaultSampler", shuffle=True),
dataset=dict(
type=_base_.dataset_type,
+ pipeline=train_pipeline,
+ modality=input_modality,
+ backend_args=backend_args,
data_root=data_root,
ann_file=info_directory_path + _base_.info_train_file_name,
- pipeline=train_pipeline,
metainfo=_base_.metainfo,
class_names=_base_.class_names,
- modality=input_modality,
- data_prefix=_base_.data_prefix,
test_mode=False,
+ data_prefix=_base_.data_prefix,
box_type_3d="LiDAR",
- backend_args=backend_args,
+ filter_cfg=filter_cfg,
),
)
+
val_dataloader = dict(
- batch_size=2,
+ batch_size=test_batch_size,
num_workers=num_workers,
persistent_workers=True,
sampler=dict(type="DefaultSampler", shuffle=False),
@@ -351,8 +308,9 @@
backend_args=backend_args,
),
)
+
test_dataloader = dict(
- batch_size=2,
+ batch_size=test_batch_size,
num_workers=num_workers,
persistent_workers=True,
sampler=dict(type="DefaultSampler", shuffle=False),
@@ -392,6 +350,7 @@
name_mapping=_base_.name_mapping,
eval_class_range=eval_class_range,
filter_attributes=_base_.filter_attributes,
+ save_csv=True,
)
# learning rate
@@ -456,14 +415,8 @@
clip_grad=dict(max_norm=35, norm_type=2),
)
-# Default setting for scaling LR automatically
-# - `enable` means enable scaling LR automatically
-# or not by default.
-# - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
-# auto_scale_lr = dict(enable=False, base_batch_size=32)
-auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size)
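+# When enabled, the LR is scaled by (train_gpu_size * train_batch_size) / base_batch_size
+# (mmengine's linear LR scaling rule).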
+auto_scale_lr = dict(enable=True, base_batch_size=4)
+# Only set when train_gpu_size is greater than 1
if train_gpu_size > 1:
sync_bn = "torch"
-
-randomness = dict(seed=0, diff_rank_seed=False, deterministic=True)
diff --git a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_4xb8_base.py b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_4xb8_base.py
index 7fd9c26f..dfce02c4 100644
--- a/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_4xb8_base.py
+++ b/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_4xb8_base.py
@@ -35,7 +35,7 @@
max_num_points = 10
max_voxels = [120000, 160000]
num_proposals = 500
-image_size = [256, 704]
+image_size = [384, 576] # height, width
num_workers = 32
lidar_sweep_dims = [0, 1, 2, 4] # x, y, z, time_lag
lidar_feature_dims = 4
@@ -94,14 +94,11 @@
in_channels=256,
out_channels=80,
image_size=image_size,
- feature_size=[32, 88],
- # xbound=[-54.0, 54.0, 0.3],
- # ybound=[-54.0, 54.0, 0.3],
+ feature_size=[48, 72],
xbound=[-122.4, 122.4, 0.68],
ybound=[-122.4, 122.4, 0.68],
zbound=[-10.0, 10.0, 20.0],
- # dbound=[1.0, 60.0, 0.5],
- dbound=[1.0, 166.2, 1.4],
+ dbound=[1.0, 134, 1.4],
downsample=2,
),
fusion_layer=dict(type="ConvFuser", in_channels=[80, 256], out_channels=256),
@@ -157,14 +154,14 @@
dict(
type="ImageAug3D",
final_dim=image_size,
- resize_lim=[0.38, 0.55],
+ resize_lim=0.02,
bot_pct_lim=[0.0, 0.0],
rot_lim=[-5.4, 5.4],
rand_flip=True,
is_train=True,
),
dict(
- type="GlobalRotScaleTrans",
+ type="BEVFusionGlobalRotScaleTrans",
rot_range=[-1.571, 1.571],
scale_ratio_range=[0.8, 1.2],
translation_std=[1.0, 1.0, 0.2],
@@ -240,7 +237,7 @@
dict(
type="ImageAug3D",
final_dim=image_size,
- resize_lim=[0.48, 0.48],
+ resize_lim=0.02,
bot_pct_lim=[0.0, 0.0],
rot_lim=[0.0, 0.0],
rand_flip=False,
@@ -421,8 +418,7 @@
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
-# auto_scale_lr = dict(enable=False, base_batch_size=32)
-auto_scale_lr = dict(enable=False, base_batch_size=train_gpu_size * train_batch_size)
+auto_scale_lr = dict(enable=True, base_batch_size=32)
# Only set if the number of train_gpu_size more than 1
if train_gpu_size > 1:
diff --git a/projects/BEVFusion/docs/BEVFusion-CL-offline/v2/base.md b/projects/BEVFusion/docs/BEVFusion-CL-offline/v2/base.md
new file mode 100644
index 00000000..6a68ca89
--- /dev/null
+++ b/projects/BEVFusion/docs/BEVFusion-CL-offline/v2/base.md
@@ -0,0 +1,41 @@
+# Deployed model for BEVFusion-CL base/2.X
+## Summary
+
+### Overview
+
+| Eval range: 120m | mAP | car | truck | bus | bicycle | pedestrian |
+| --------------------------------| ---- | ---- | ----- | ---- | ------- | ---------- |
+| BEVFusion-CL-offline base/2.0.0 | 77.8 | 87.3 | 61.6 | 85.9 | 73.2 | 80.9 |
+| BEVFusion-CL base/2.0.0         | 76.3 | 80.5 | 61.9 | 85.9 | 74.7 | 78.7 |
+
+
+## Release
+
+### BEVFusion-CL-offline base/2.0.0
+
+
+ Links to data and evaluation results
+
+- Model
+ - Training dataset: DB JPNTAXI v1.0 + DB JPNTAXI v2.0 + DB JPNTAXI v4.0 + DB GSM8 v1.0 + DB J6 v1.0 + DB J6 v2.0 + DB J6 v3.0 + DB J6 v5.0 + DB J6 Gen2 v1.0 + DB J6 Gen2 v2.0 + DB J6 Gen2 v4.0 + DB LargeBus v1.0 (total frames: 71,633)
+ - [Config file path](https://github.com/tier4/AWML/blob/50f35a8ae52c4892351be0c7aa5d260c1b310b7e/projects/BEVFusion/configs/t4dataset/BEVFusion-CL-offline/bevfusion_camera_lidar_offline_voxel_second_secfpn_4xb8_base.py)
+ - Training results [model-zoo]
+ - [logs.zip](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl-offline/t4base/v2.0.0/logs.zip)
+ - [checkpoint_best.pth](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl-offline/t4base/v2.0.0/best_NuScenes_metric_T4Metric_mAP_epoch_30.pth)
+ - [config.py](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl-offline/t4base/v2.0.0/bevfusion_camera_lidar_voxel_second_secfpn_2xb2_t4offline_no_intensity.py)
+ - Train time: NVIDIA H100 80GB * 4 * 50 epochs = 3 days and 20 hours
+ - Batch size: 4*5 = 20
+
+- Evaluation
+  - db_jpntaxi_v1 + db_jpntaxi_v2 + db_jpntaxi_v4 + db_gsm8_v1 + db_j6_v1 + db_j6_v2 + db_j6_v3 + db_j6_v5 + db_j6gen2_v1 + db_j6gen2_v2 + db_j6gen2_v4 + db_largebus_v1 (total frames: 5,703):
+  - Total mAP (eval range = 120m): 0.778
+
+| class_name | Count | mAP | AP@0.5m | AP@1.0m | AP@2.0m | AP@4.0m |
+| ---- | ------- | ---- | ---- | ---- | ---- | ---- |
+| car | 144,001 | 87.3 | 77.5 | 87.8 | 91.6 | 92.2 |
+| truck | 20,823 | 61.6 | 41.0 | 61.3 | 69.0 | 74.9 |
+| bus | 5,691 | 85.9 | 75.6 | 85.6 | 90.3 | 92.2 |
+| bicycle | 5,007 | 73.2 | 71.4 | 73.5 | 73.7 | 74.1 |
+| pedestrian | 42,034 | 80.9 | 79.5 | 80.5 | 81.3 | 82.3 |
+
+
diff --git a/projects/BEVFusion/docs/BEVFusion-CL/v2/base.md b/projects/BEVFusion/docs/BEVFusion-CL/v2/base.md
index 0ef9f423..eaf2f9cb 100644
--- a/projects/BEVFusion/docs/BEVFusion-CL/v2/base.md
+++ b/projects/BEVFusion/docs/BEVFusion-CL/v2/base.md
@@ -7,7 +7,8 @@
| Eval range: 120m | mAP | car | truck | bus | bicycle | pedestrian |
| --------------------------------| ---- | ---- | ----- | ---- | ------- | ---------- |
| BEVFusion-CL base/2.0.0 (A) | 70.72 | 81.04 | **62.06** | 82.52 | **70.82** | 57.14 |
-| BEVFusion-CL base/2.0.0 (B) | **75.03** | 79.62 | 61.20 | **86.67** | 69.99 | **77.62** |
+| BEVFusion-CL base/2.0.0 (B) | 75.03 | 79.62 | 61.20 | **86.67** | 69.99 | 77.62 |
+| BEVFusion-CL base/2.0.0 (C) | **76.3** | 80.50 | 61.90 | 85.90 | **74.70** | **78.70** |
### Datasets
#### base
@@ -40,12 +41,47 @@
| BEVFusion-CL base/2.0.0 (A) | 69.99 | 79.41 | 64.64 | 83.58 | 67.03 | 55.28 |
| BEVFusion-CL base/2.0.0 (B) | 74.48 | 77.28 | 62.67 | 87.92 | 66.58 | 77.98 |
+- BEVFusion-CL base/2.0.0 (A): Trained without intensity and with pooled (downsampled) pedestrian heatmaps
+- BEVFusion-CL base/2.0.0 (B): Same as `BEVFusion-CL base/2.0.0 (A)` but without pooling the pedestrian heatmaps
+- BEVFusion-CL base/2.0.0 (C): Same as `BEVFusion-CL base/2.0.0 (B)` with improved image ROI cropping and fixed augmentation parameters
## Release
-### BEVFusion-CL base/2.0.0
-- BEVFusion-CL base/2.0.0 (A): Without intensity and training pedestrians with pooling pedestrians
-- BEVFusion-CL base/2.0.0 (B): Same as `BEVFusion-CL base/2.0.0 (A)` without pooling pedestrians
+### BEVFusion-CL base/2.0.0 (C)
+
+
+ Links to data and evaluation results
+
+- Model
+ - Training dataset: DB JPNTAXI v1.0 + DB JPNTAXI v2.0 + DB JPNTAXI v4.0 + DB GSM8 v1.0 + DB J6 v1.0 + DB J6 v2.0 + DB J6 v3.0 + DB J6 v5.0 + DB J6 Gen2 v1.0 + DB J6 Gen2 v2.0 + DB J6 Gen2 v4.0 + DB LargeBus v1.0 (total frames: 71,633)
+ - [Config file path](https://github.com/tier4/AWML/blob/50f35a8ae52c4892351be0c7aa5d260c1b310b7e/projects/BEVFusion/configs/t4dataset/BEVFusion-CL/bevfusion_camera_lidar_voxel_second_secfpn_4xb8_base.py)
+ - Deployed onnx model and ROS parameter files [[WebAuto (for internal)]](WIP)
+ - Deployed onnx and ROS parameter files [[model-zoo]]
+ - [image_backbone.onnx](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl/t4base/v2.0.0/image_backbone.onnx)
+ - [main_body.onnx](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl/t4base/v2.0.0/main_body.onnx)
+ - Training results [model-zoo]
+ - [logs.zip](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl/t4base/v2.0.0/log.zip)
+ - [checkpoint_best.pth](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl/t4base/v2.0.0/best_NuScenes_metric_T4Metric_mAP_epoch_48.pth)
+ - [config.py](https://download.autoware-ml-model-zoo.tier4.jp/autoware-ml/models/bevfusion/bevfusion-cl/t4base/v2.0.0/bevfusion_camera_lidar_voxel_second_secfpn_4xb8_base.py)
+ - [PR](https://github.com/tier4/AWML/pull/88)
+ - Train time: NVIDIA H100 80GB * 4 * 50 epochs = 3 days and 20 hours
+ - Batch size: 4*8 = 32
+
+- Evaluation
+  - db_jpntaxi_v1 + db_jpntaxi_v2 + db_jpntaxi_v4 + db_gsm8_v1 + db_j6_v1 + db_j6_v2 + db_j6_v3 + db_j6_v5 + db_j6gen2_v1 + db_j6gen2_v2 + db_j6gen2_v4 + db_largebus_v1 (total frames: 5,703):
+ - Total mAP (eval range = 120m): 0.763
+
+| class_name | Count | mAP | AP@0.5m | AP@1.0m | AP@2.0m | AP@4.0m |
+| ---- | ------- | ---- | ---- | ---- | ---- | ---- |
+| car | 144,001 | 80.5 | 69.2 | 80.5 | 85.1 | 87.2 |
+| truck | 20,823 | 61.9 | 37.7 | 60.9 | 71.1 | 78.1 |
+| bus | 5,691 | 85.9 | 71.9 | 86.0 | 92.1 | 93.5 |
+| bicycle | 5,007 | 74.7 | 71.2 | 75.4 | 75.9 | 76.4 |
+| pedestrian | 42,034 | 78.7 | 76.1 | 78.4 | 79.5 | 80.6 |
+
+
+
+### BEVFusion-CL base/2.0.0 (B)
- We report only `BEVFusion-CL base/2.0.0 (B)` since its performance is much better than `BEVFusion-CL base/2.0.0 (A)`, mainly because it does not downsample the dense pedestrian heatmaps and therefore has more queries