diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..70e40d6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.dist_test/
+*.avi
+ckpts/
+data/bench2drive/
+data/infos
+data_carla
+viz/
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# output & ckpts
+output/
+test/
+ckpts/
+ckpts
+
+# work_dirs
+**/work_dirs
+
+batchscript*
+phoenix*
+
+debug/
+*projs/
+
+INFO
+pyrightconfig.json
+.vscode/
+*.pth
+*.log
+tmp_ckpts/
+val/
+*.ipynb
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..59c5fd8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,52 @@
+# Bench2DriveZoo
+
+# Introduction
+
+- We implement training and open-loop evaluation for [BEVFormer](https://github.com/fundamentalvision/BEVFormer), [UniAD](https://github.com/OpenDriveLab/UniAD), and [VAD](https://github.com/hustvl/VAD) on the [Bench2Drive](https://github.com/Thinklab-SJTU/Bench2Drive) dataset.
+- We implement the closed-loop evaluation pipeline in Carla for UniAD and VAD on Bench2Drive.
+- We simplify the code framework by merging multiple dependencies (mmcv, mmseg, mmdet, and mmdet3d) into a single library and support the latest version of PyTorch (2.3.1), which greatly facilitates installation and development (see the import sketch below).
+
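+For example, components that previously lived in the separate mmdet/mmdet3d packages are now imported from the merged `mmcv` package. Below is a minimal sketch based on the imports used by `adzoo/bevformer/apis/mmdet_train.py` in this repository (adjust to your environment):
+
+```python
+from mmcv.utils import get_root_logger             # logging helper from the merged library
+from mmcv.datasets import build_dataset            # dataset construction
+from mmcv.datasets.builder import build_dataloader # dataloader construction
+from mmcv.optims import build_optimizer            # optimizer construction
+```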
+
+
+# Getting Started
+
+- [Installation](docs/INSTALL.md)
+- [Prepare Dataset](docs/INSTALL.md)
+- [Train and Open-Loop Eval](docs/TRAIN_EVAL.md)
+- [Closed-Loop Eval in Carla](docs/EVAL_IN_CARLA.md)
+- [Convert Codes from Nuscenes to Bench2Drive](docs/CONVERT_GUIDE.md)
+
+# Results and Pre-trained Models
+
+## UniAD and VAD
+
+| Method | L2 (m) 2s | Driving Score | Success Rate (%) | Config | Download |
+| :---: | :---: | :---: | :---: | :---: |:---: |
+| UniAD-Tiny |0.80 | 32.00 | 9.54 | [config](adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/bevformer_tiny_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/1psr7AKYHD7CitZ30Bz-9sA?pwd=1234 )|
+| UniAD-Base |0.73 | 37.72 | 9.54 | [config](adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/uniad_base_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/11p9IUGqTax1f4W_qsdLCRw?pwd=1234) |
+| VAD |0.91 | 39.4 | 10.0 | [config](adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/vad_b2d_base.pth)/[Baidu Cloud]( https://pan.baidu.com/s/11p9IUGqTax1f4W_qsdLCRw?pwd=1234) |
+
+## BEVFormer
+
+| Method | mAP | NDS | Config | Download |
+| :---: | :---: | :---: | :---: | :---: |
+| BEVFormer-Tiny | 0.37 | 0.43 | [config](adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/bevformer_tiny_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/1TWMs9YgKYm2DF5YfXF8i3g?pwd=1234) |
+| BEVFormer-Base | 0.63 | 0.67 | [config](adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/bevformer_base_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/1Y4VkE1gc8BU0zJ4z2fmIkQ?pwd=1234) |
+
+
+# Related Resources
+
+- [Bench2Drive](https://github.com/Thinklab-SJTU/Bench2Drive)
+- [BEVFormer](https://github.com/fundamentalvision/BEVFormer)
+- [UniAD](https://github.com/OpenDriveLab/UniAD)
+- [VAD](https://github.com/hustvl/VAD)
diff --git a/adzoo/__init__.py b/adzoo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/adzoo/bevformer/analysis_tools/__init__.py b/adzoo/bevformer/analysis_tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/adzoo/bevformer/analysis_tools/analyze_logs.py b/adzoo/bevformer/analysis_tools/analyze_logs.py
new file mode 100755
index 0000000..806175f
--- /dev/null
+++ b/adzoo/bevformer/analysis_tools/analyze_logs.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import numpy as np
+import seaborn as sns
+from collections import defaultdict
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+ for i, log_dict in enumerate(log_dicts):
+ print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+ all_times = []
+ for epoch in log_dict.keys():
+ if args.include_outliers:
+ all_times.append(log_dict[epoch]['time'])
+ else:
+ all_times.append(log_dict[epoch]['time'][1:])
+ all_times = np.array(all_times)
+ epoch_ave_time = all_times.mean(-1)
+ slowest_epoch = epoch_ave_time.argmax()
+ fastest_epoch = epoch_ave_time.argmin()
+ std_over_epoch = epoch_ave_time.std()
+ print(f'slowest epoch {slowest_epoch + 1}, '
+ f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+ print(f'fastest epoch {fastest_epoch + 1}, '
+ f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+ print(f'time std over epochs is {std_over_epoch:.4f}')
+ print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+ print()
+
+
+def plot_curve(log_dicts, args):
+ if args.backend is not None:
+ plt.switch_backend(args.backend)
+ sns.set_style(args.style)
+ # if legend is None, use {filename}_{key} as legend
+ legend = args.legend
+ if legend is None:
+ legend = []
+ for json_log in args.json_logs:
+ for metric in args.keys:
+ legend.append(f'{json_log}_{metric}')
+ assert len(legend) == (len(args.json_logs) * len(args.keys))
+ metrics = args.keys
+
+ num_metrics = len(metrics)
+ for i, log_dict in enumerate(log_dicts):
+ epochs = list(log_dict.keys())
+ for j, metric in enumerate(metrics):
+ print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+ if metric not in log_dict[epochs[args.interval - 1]]:
+ raise KeyError(
+ f'{args.json_logs[i]} does not contain metric {metric}')
+
+ if args.mode == 'eval':
+ if min(epochs) == args.interval:
+ x0 = args.interval
+ else:
+                    # if the current training run was resumed from a previous
+                    # checkpoint, information from the early epochs is lost, so
+                    # `xs` should start according to `min(epochs)`
+ if min(epochs) % args.interval == 0:
+ x0 = min(epochs)
+ else:
+                        # find the first epoch that does eval
+ x0 = min(epochs) + args.interval - \
+ min(epochs) % args.interval
+ xs = np.arange(x0, max(epochs) + 1, args.interval)
+ ys = []
+ for epoch in epochs[args.interval - 1::args.interval]:
+ ys += log_dict[epoch][metric]
+
+ # if training is aborted before eval of the last epoch
+ # `xs` and `ys` will have different length and cause an error
+ # check if `ys[-1]` is empty here
+ if not log_dict[epoch][metric]:
+ xs = xs[:-1]
+
+ ax = plt.gca()
+ ax.set_xticks(xs)
+ plt.xlabel('epoch')
+ plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+ else:
+ xs = []
+ ys = []
+ num_iters_per_epoch = \
+ log_dict[epochs[args.interval-1]]['iter'][-1]
+ for epoch in epochs[args.interval - 1::args.interval]:
+ iters = log_dict[epoch]['iter']
+ if log_dict[epoch]['mode'][-1] == 'val':
+ iters = iters[:-1]
+ xs.append(
+ np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+ ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+ xs = np.concatenate(xs)
+ ys = np.concatenate(ys)
+ plt.xlabel('iter')
+ plt.plot(
+ xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+ plt.legend()
+ if args.title is not None:
+ plt.title(args.title)
+ if args.out is None:
+ plt.show()
+ else:
+ print(f'save curve to: {args.out}')
+ plt.savefig(args.out)
+ plt.cla()
+
+
+def add_plot_parser(subparsers):
+ parser_plt = subparsers.add_parser(
+ 'plot_curve', help='parser for plotting curves')
+ parser_plt.add_argument(
+ 'json_logs',
+ type=str,
+ nargs='+',
+ help='path of train log in json format')
+ parser_plt.add_argument(
+ '--keys',
+ type=str,
+ nargs='+',
+ default=['mAP_0.25'],
+ help='the metric that you want to plot')
+ parser_plt.add_argument('--title', type=str, help='title of figure')
+ parser_plt.add_argument(
+ '--legend',
+ type=str,
+ nargs='+',
+ default=None,
+ help='legend of each plot')
+ parser_plt.add_argument(
+ '--backend', type=str, default=None, help='backend of plt')
+ parser_plt.add_argument(
+ '--style', type=str, default='dark', help='style of plt')
+ parser_plt.add_argument('--out', type=str, default=None)
+ parser_plt.add_argument('--mode', type=str, default='train')
+ parser_plt.add_argument('--interval', type=int, default=1)
+
+
+def add_time_parser(subparsers):
+ parser_time = subparsers.add_parser(
+ 'cal_train_time',
+ help='parser for computing the average time per training iteration')
+ parser_time.add_argument(
+ 'json_logs',
+ type=str,
+ nargs='+',
+ help='path of train log in json format')
+ parser_time.add_argument(
+ '--include-outliers',
+ action='store_true',
+ help='include the first value of every epoch when computing '
+ 'the average time')
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only supports plotting curves and calculating the average train time
+ subparsers = parser.add_subparsers(dest='task', help='task parser')
+ add_plot_parser(subparsers)
+ add_time_parser(subparsers)
+ args = parser.parse_args()
+ return args
+
+
+def load_json_logs(json_logs):
+    # load and convert json_logs to log_dicts: each key is an epoch and each value is a sub dict
+    # whose keys are the different metrics, e.g. memory, bbox_mAP, and whose values are lists of
+    # the corresponding values over all iterations
+ log_dicts = [dict() for _ in json_logs]
+ for json_log, log_dict in zip(json_logs, log_dicts):
+ with open(json_log, 'r') as log_file:
+ for line in log_file:
+ log = json.loads(line.strip())
+ # skip lines without `epoch` field
+ if 'epoch' not in log:
+ continue
+ epoch = log.pop('epoch')
+ if epoch not in log_dict:
+ log_dict[epoch] = defaultdict(list)
+ for k, v in log.items():
+ log_dict[epoch][k].append(v)
+ return log_dicts
+
+
+def main():
+ args = parse_args()
+
+ json_logs = args.json_logs
+ for json_log in json_logs:
+ assert json_log.endswith('.json')
+
+ log_dicts = load_json_logs(json_logs)
+
+ eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+ main()
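+
+# Example invocations (sketch; the json log paths below are hypothetical and depend on your work_dir):
+#   python analyze_logs.py cal_train_time path/to/20240101_000000.log.json
+#   python analyze_logs.py plot_curve path/to/20240101_000000.log.json --keys loss --out loss.png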
diff --git a/adzoo/bevformer/analysis_tools/benchmark.py b/adzoo/bevformer/analysis_tools/benchmark.py
new file mode 100755
index 0000000..487a348
--- /dev/null
+++ b/adzoo/bevformer/analysis_tools/benchmark.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+sys.path.append('.')
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+# from mmdet3d.datasets import build_dataloader, build_dataset
+from mmdet3d.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument('--samples', type=int, default=2000, help='samples to benchmark')
+ parser.add_argument(
+        '--log-interval', type=int, default=50, help='interval of logging')
+ parser.add_argument(
+ '--fuse-conv-bn',
+ action='store_true',
+        help='Whether to fuse conv and bn; this will slightly increase '
+        'the inference speed')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+ cfg.model.pretrained = None
+ cfg.data.test.test_mode = True
+
+ # build the dataloader
+ # TODO: support multiple images per gpu (only minor changes are needed)
+ print(cfg.data.test)
+ dataset = custom_build_dataset(cfg.data.test)
+ data_loader = build_dataloader(
+ dataset,
+ samples_per_gpu=1,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=False,
+ shuffle=False)
+
+ # build the model and load checkpoint
+ cfg.model.train_cfg = None
+ model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(model)
+ if args.checkpoint is not None:
+ load_checkpoint(model, args.checkpoint, map_location='cpu')
+ #if args.fuse_conv_bn:
+ # model = fuse_module(model)
+
+ model = MMDataParallel(model, device_ids=[0])
+
+ model.eval()
+
+ # the first several iterations may be very slow so skip them
+ num_warmup = 5
+ pure_inf_time = 0
+
+ # benchmark with several samples and take the average
+ for i, data in enumerate(data_loader):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ with torch.no_grad():
+ model(return_loss=False, rescale=True, **data)
+
+ torch.cuda.synchronize()
+ elapsed = time.perf_counter() - start_time
+
+ if i >= num_warmup:
+ pure_inf_time += elapsed
+ if (i + 1) % args.log_interval == 0:
+ fps = (i + 1 - num_warmup) / pure_inf_time
+ print(f'Done image [{i + 1:<3}/ {args.samples}], '
+ f'fps: {fps:.1f} img / s')
+
+ if (i + 1) == args.samples:
+ pure_inf_time += elapsed
+ fps = (i + 1 - num_warmup) / pure_inf_time
+ print(f'Overall fps: {fps:.1f} img / s')
+ break
+
+
+if __name__ == '__main__':
+ main()
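+
+# Example (sketch; the config path comes from this repo, the checkpoint path is an assumption):
+#   python benchmark.py adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py \
+#       --checkpoint ckpts/bevformer_tiny_b2d.pth --log-interval 50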
diff --git a/adzoo/bevformer/analysis_tools/get_params.py b/adzoo/bevformer/analysis_tools/get_params.py
new file mode 100644
index 0000000..fb697ad
--- /dev/null
+++ b/adzoo/bevformer/analysis_tools/get_params.py
@@ -0,0 +1,10 @@
+import torch
+file_path = './ckpts/bevformer_v4.pth'
+model = torch.load(file_path, map_location='cpu')
+num_params = 0
+for key in model['state_dict'].keys():
+    num_params += model['state_dict'][key].nelement()
+print(num_params)
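+# The loop above is equivalent to this one-liner over the state_dict values:
+#   sum(v.nelement() for v in model['state_dict'].values())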
+
+# smaller 63374123
+# v4 69140395
diff --git a/adzoo/bevformer/analysis_tools/visual.py b/adzoo/bevformer/analysis_tools/visual.py
new file mode 100644
index 0000000..f711b75
--- /dev/null
+++ b/adzoo/bevformer/analysis_tools/visual.py
@@ -0,0 +1,477 @@
+# Based on https://github.com/nutonomy/nuscenes-devkit
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+
+import mmcv
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+from matplotlib.axes import Axes
+from PIL import Image
+from pyquaternion import Quaternion
+from tqdm import tqdm
+from typing import Tuple, List, Iterable
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox
+from nuscenes.eval.detection.data_classes import DetectionBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.detection.render import visualize_sample
+
+
+
+
+cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+
+def render_annotation(
+ anntoken: str,
+ margin: float = 10,
+ view: np.ndarray = np.eye(4),
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ out_path: str = 'render.png',
+ extra_info: bool = False) -> None:
+ """
+ Render selected annotation.
+ :param anntoken: Sample_annotation token.
+ :param margin: How many meters in each direction to include in LIDAR view.
+ :param view: LIDAR view point.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param out_path: Optional path to save the rendered figure to disk.
+ :param extra_info: Whether to render extra information below camera view.
+ """
+ ann_record = nusc.get('sample_annotation', anntoken)
+ sample_record = nusc.get('sample', ann_record['sample_token'])
+ assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.'
+
+ # Figure out which camera the object is fully visible in (this may return nothing).
+ boxes, cam = [], []
+ cams = [key for key in sample_record['data'].keys() if 'CAM' in key]
+ all_bboxes = []
+ select_cams = []
+ for cam in cams:
+ _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level,
+ selected_anntokens=[anntoken])
+ if len(boxes) > 0:
+ all_bboxes.append(boxes)
+ select_cams.append(cam)
+ # We found an image that matches. Let's abort.
+ # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \
+ # 'Try using e.g. BoxVisibility.ANY.'
+ # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!'
+
+ num_cam = len(all_bboxes)
+
+ fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9))
+ select_cams = [sample_record['data'][cam] for cam in select_cams]
+ print('bbox in cams:', select_cams)
+ # Plot LIDAR view.
+ lidar = sample_record['data']['LIDAR_TOP']
+ data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken])
+ LidarPointCloud.from_file(data_path).render_height(axes[0], view=view)
+ for box in boxes:
+ c = np.array(get_color(box.name)) / 255.0
+ box.render(axes[0], view=view, colors=(c, c, c))
+ corners = view_points(boxes[0].corners(), view, False)[:2, :]
+ axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin])
+ axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin])
+ axes[0].axis('off')
+ axes[0].set_aspect('equal')
+
+ # Plot CAMERA view.
+ for i in range(1, num_cam + 1):
+ cam = select_cams[i - 1]
+ data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken])
+ im = Image.open(data_path)
+ axes[i].imshow(im)
+ axes[i].set_title(nusc.get('sample_data', cam)['channel'])
+ axes[i].axis('off')
+ axes[i].set_aspect('equal')
+ for box in boxes:
+ c = np.array(get_color(box.name)) / 255.0
+ box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+ # Print extra information about the annotation below the camera view.
+ axes[i].set_xlim(0, im.size[0])
+ axes[i].set_ylim(im.size[1], 0)
+
+ if extra_info:
+ rcParams['font.family'] = 'monospace'
+
+ w, l, h = ann_record['size']
+ category = ann_record['category_name']
+ lidar_points = ann_record['num_lidar_pts']
+ radar_points = ann_record['num_radar_pts']
+
+ sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP'])
+ pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token'])
+ dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation']))
+
+ information = ' \n'.join(['category: {}'.format(category),
+ '',
+ '# lidar points: {0:>4}'.format(lidar_points),
+ '# radar points: {0:>4}'.format(radar_points),
+ '',
+ 'distance: {:>7.3f}m'.format(dist),
+ '',
+ 'width: {:>7.3f}m'.format(w),
+ 'length: {:>7.3f}m'.format(l),
+ 'height: {:>7.3f}m'.format(h)])
+
+ plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top')
+
+ if out_path is not None:
+ plt.savefig(out_path)
+
+
+
+def get_sample_data(sample_data_token: str,
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ selected_anntokens=None,
+ use_flat_vehicle_coordinates: bool = False):
+ """
+ Returns the data path as well as all annotations related to that sample_data.
+ Note that the boxes are transformed into the current sensor's coordinate frame.
+ :param sample_data_token: Sample_data token.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param selected_anntokens: If provided only return the selected annotation.
+ :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+ aligned to z-plane in the world.
+ :return: (data_path, boxes, camera_intrinsic )
+ """
+
+ # Retrieve sensor & pose records
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+ data_path = nusc.get_sample_data_path(sample_data_token)
+
+ if sensor_record['modality'] == 'camera':
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ else:
+ cam_intrinsic = None
+ imsize = None
+
+ # Retrieve all sample annotations and map to sensor coordinate system.
+ if selected_anntokens is not None:
+ boxes = list(map(nusc.get_box, selected_anntokens))
+ else:
+ boxes = nusc.get_boxes(sample_data_token)
+
+ # Make list of Box objects including coord system transforms.
+ box_list = []
+ for box in boxes:
+ if use_flat_vehicle_coordinates:
+ # Move box to ego vehicle coord system parallel to world z plane.
+ yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+ else:
+ # Move box to ego vehicle coord system.
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if sensor_record['modality'] == 'camera' and not \
+ box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+ continue
+
+ box_list.append(box)
+
+ return data_path, box_list, cam_intrinsic
+
+
+
+def get_predicted_data(sample_data_token: str,
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ selected_anntokens=None,
+ use_flat_vehicle_coordinates: bool = False,
+ pred_anns=None
+ ):
+ """
+ Returns the data path as well as all annotations related to that sample_data.
+ Note that the boxes are transformed into the current sensor's coordinate frame.
+ :param sample_data_token: Sample_data token.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param selected_anntokens: If provided only return the selected annotation.
+ :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+ aligned to z-plane in the world.
+ :return: (data_path, boxes, camera_intrinsic )
+ """
+
+ # Retrieve sensor & pose records
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+ data_path = nusc.get_sample_data_path(sample_data_token)
+
+ if sensor_record['modality'] == 'camera':
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ else:
+ cam_intrinsic = None
+ imsize = None
+
+ # Retrieve all sample annotations and map to sensor coordinate system.
+ # if selected_anntokens is not None:
+ # boxes = list(map(nusc.get_box, selected_anntokens))
+ # else:
+ # boxes = nusc.get_boxes(sample_data_token)
+ boxes = pred_anns
+ # Make list of Box objects including coord system transforms.
+ box_list = []
+ for box in boxes:
+ if use_flat_vehicle_coordinates:
+ # Move box to ego vehicle coord system parallel to world z plane.
+ yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+ else:
+ # Move box to ego vehicle coord system.
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if sensor_record['modality'] == 'camera' and not \
+ box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+ continue
+ box_list.append(box)
+
+ return data_path, box_list, cam_intrinsic
+
+
+
+
+def lidar_render(sample_token, data, out_path=None):
+ bbox_gt_list = []
+ bbox_pred_list = []
+ anns = nusc.get('sample', sample_token)['anns']
+ for ann in anns:
+ content = nusc.get('sample_annotation', ann)
+ try:
+ bbox_gt_list.append(DetectionBox(
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=nusc.box_velocity(content['token'])[:2],
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=category_to_detection_name(content['category_name']),
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=''))
+ except:
+ pass
+
+ bbox_anns = data['results'][sample_token]
+ for content in bbox_anns:
+ bbox_pred_list.append(DetectionBox(
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name']))
+ gt_annotations = EvalBoxes()
+ pred_annotations = EvalBoxes()
+ gt_annotations.add_boxes(sample_token, bbox_gt_list)
+ pred_annotations.add_boxes(sample_token, bbox_pred_list)
+ print('green is ground truth')
+    print('blue is the predicted result')
+ visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, savepath=out_path+'_bev')
+
+
+def get_color(category_name: str):
+ """
+ Provides the default colors based on the category names.
+ This method works for the general nuScenes categories, as well as the nuScenes detection categories.
+ """
+ a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+ 'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+ 'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+ 'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+ 'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+ 'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+ 'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+ 'vehicle.ego']
+ class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+ ]
+ #print(category_name)
+ if category_name == 'bicycle':
+ return nusc.colormap['vehicle.bicycle']
+ elif category_name == 'construction_vehicle':
+ return nusc.colormap['vehicle.construction']
+ elif category_name == 'traffic_cone':
+ return nusc.colormap['movable_object.trafficcone']
+
+ for key in nusc.colormap.keys():
+ if category_name in key:
+ return nusc.colormap[key]
+ return [0, 0, 0]
+
+
+def render_sample_data(
+        sample_token: str,
+ with_anns: bool = True,
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ axes_limit: float = 40,
+ ax=None,
+ nsweeps: int = 1,
+ out_path: str = None,
+ underlay_map: bool = True,
+ use_flat_vehicle_coordinates: bool = True,
+ show_lidarseg: bool = False,
+ show_lidarseg_legend: bool = False,
+ filter_lidarseg_labels=None,
+ lidarseg_preds_bin_path: str = None,
+ verbose: bool = True,
+ show_panoptic: bool = False,
+ pred_data=None,
+ ) -> None:
+ """
+ Render sample data onto axis.
+    :param sample_token: Sample token.
+ :param with_anns: Whether to draw box annotations.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param axes_limit: Axes limit for lidar and radar (measured in meters).
+ :param ax: Axes onto which to render.
+ :param nsweeps: Number of sweeps for lidar and radar.
+ :param out_path: Optional path to save the rendered figure to disk.
+ :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow.
+ :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+ aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which
+ can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new
+ setting is more correct and rotates the plot by ~90 degrees.
+ :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set
+ to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+ :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame.
+ :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None
+ or the list is empty, all classes will be displayed.
+ :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation
+ predictions for the sample.
+ :param verbose: Whether to display the image after it is rendered.
+ :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set
+ to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+ If show_lidarseg is True, show_panoptic will be set to False.
+ """
+    lidar_render(sample_token, pred_data, out_path=out_path)
+    sample = nusc.get('sample', sample_token)
+ # sample = data['results'][sample_token_list[0]][0]
+ cams = [
+ 'CAM_FRONT_LEFT',
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_RIGHT',
+ ]
+ if ax is None:
+ _, ax = plt.subplots(4, 3, figsize=(24, 18))
+ j = 0
+ for ind, cam in enumerate(cams):
+ sample_data_token = sample['data'][cam]
+
+ sd_record = nusc.get('sample_data', sample_data_token)
+ sensor_modality = sd_record['sensor_modality']
+
+ if sensor_modality in ['lidar', 'radar']:
+ assert False
+ elif sensor_modality == 'camera':
+ # Load boxes and image.
+ boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']),
+ name=record['detection_name'], token='predicted') for record in
+                         pred_data['results'][sample_token] if record['detection_score'] > 0.2]
+
+ data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token,
+ box_vis_level=box_vis_level, pred_anns=boxes)
+ _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level)
+ if ind == 3:
+ j += 1
+ ind = ind % 3
+ data = Image.open(data_path)
+ # mmcv.imwrite(np.array(data)[:,:,::-1], f'{cam}.png')
+ # Init axes.
+
+ # Show image.
+ ax[j, ind].imshow(data)
+ ax[j + 2, ind].imshow(data)
+
+ # Show boxes.
+ if with_anns:
+ for box in boxes_pred:
+ c = np.array(get_color(box.name)) / 255.0
+ box.render(ax[j, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+ for box in boxes_gt:
+ c = np.array(get_color(box.name)) / 255.0
+ box.render(ax[j + 2, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+ # Limit visible range.
+ ax[j, ind].set_xlim(0, data.size[0])
+ ax[j, ind].set_ylim(data.size[1], 0)
+ ax[j + 2, ind].set_xlim(0, data.size[0])
+ ax[j + 2, ind].set_ylim(data.size[1], 0)
+
+ else:
+ raise ValueError("Error: Unknown sensor modality!")
+
+ ax[j, ind].axis('off')
+ ax[j, ind].set_title('PRED: {} {labels_type}'.format(
+ sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else ''))
+ ax[j, ind].set_aspect('equal')
+
+ ax[j + 2, ind].axis('off')
+ ax[j + 2, ind].set_title('GT:{} {labels_type}'.format(
+ sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else ''))
+ ax[j + 2, ind].set_aspect('equal')
+
+ if out_path is not None:
+ plt.savefig(out_path+'_camera', bbox_inches='tight', pad_inches=0, dpi=200)
+ if verbose:
+ plt.show()
+ plt.close()
+
+if __name__ == '__main__':
+ nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True)
+ # render_annotation('7603b030b42a4b1caa8c443ccc1a7d52')
+ bevformer_results = mmcv.load('test/bevformer_base/Thu_Jun__9_16_22_37_2022/pts_bbox/results_nusc.json')
+ sample_token_list = list(bevformer_results['results'].keys())
+ for id in range(0, 10):
+ render_sample_data(sample_token_list[id], pred_data=bevformer_results, out_path=sample_token_list[id])
diff --git a/adzoo/bevformer/apis/__init__.py b/adzoo/bevformer/apis/__init__.py
new file mode 100644
index 0000000..15520b2
--- /dev/null
+++ b/adzoo/bevformer/apis/__init__.py
@@ -0,0 +1,2 @@
+from .train import custom_train_model
+from .mmdet_train import custom_train_detector
\ No newline at end of file
diff --git a/adzoo/bevformer/apis/mmdet_train.py b/adzoo/bevformer/apis/mmdet_train.py
new file mode 100644
index 0000000..3372f16
--- /dev/null
+++ b/adzoo/bevformer/apis/mmdet_train.py
@@ -0,0 +1,193 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import random
+import warnings
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch.nn import DataParallel
+from torch.nn.parallel.distributed import DistributedDataParallel
+from mmcv.runner import HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_runner
+from mmcv.utils import build_from_cfg, get_root_logger
+
+from mmcv.core import EvalHook
+from mmcv.optims import build_optimizer
+from mmcv.datasets import build_dataset, replace_ImageToTensor
+import time
+import os.path as osp
+from mmcv.datasets.builder import build_dataloader
+from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook
+
+def custom_train_detector(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ eval_model=None,
+ meta=None):
+ logger = get_root_logger(cfg.log_level)
+
+ # prepare data loaders
+
+ dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    # assert len(dataset) == 1
+ if 'imgs_per_gpu' in cfg.data:
+ logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+ 'Please use "samples_per_gpu" instead')
+ if 'samples_per_gpu' in cfg.data:
+ logger.warning(
+ f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+ f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiment')
+ else:
+ logger.warning(
+ 'Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                f'{cfg.data.imgs_per_gpu} in this experiment')
+ cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
+
+ data_loaders = [
+ build_dataloader(
+ ds,
+ cfg.data.samples_per_gpu,
+ cfg.data.workers_per_gpu,
+ # cfg.gpus will be ignored if distributed
+ len(cfg.gpu_ids),
+ dist=distributed,
+ seed=cfg.seed,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ ) for ds in dataset
+ ]
+ # put model on gpus
+ if distributed:
+ find_unused_parameters = cfg.get('find_unused_parameters', False)
+ # Sets the `find_unused_parameters` parameter in
+ # torch.nn.parallel.DistributedDataParallel
+ model = DistributedDataParallel(
+ model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ if eval_model is not None:
+ eval_model = DistributedDataParallel(
+ eval_model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ else:
+ model = DataParallel(
+ model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+ if eval_model is not None:
+ eval_model = DataParallel(
+ eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+ # build runner
+ optimizer = build_optimizer(model, cfg.optimizer)
+
+ if 'runner' not in cfg:
+ cfg.runner = {
+ 'type': 'EpochBasedRunner',
+ 'max_epochs': cfg.total_epochs
+ }
+ warnings.warn(
+ 'config is now expected to have a `runner` section, '
+ 'please set `runner` in your config.', UserWarning)
+ else:
+ if 'total_epochs' in cfg:
+ assert cfg.total_epochs == cfg.runner.max_epochs
+ if eval_model is not None:
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ eval_model=eval_model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+ else:
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+
+ # an ugly workaround to make .log and .log.json filenames the same
+ runner.timestamp = timestamp
+
+ # fp16 setting
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ optimizer_config = Fp16OptimizerHook(
+ **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
+ elif distributed and 'type' not in cfg.optimizer_config:
+ optimizer_config = OptimizerHook(**cfg.optimizer_config)
+ else:
+ optimizer_config = cfg.optimizer_config
+
+ # register hooks
+ runner.register_training_hooks(cfg.lr_config, optimizer_config,
+ cfg.checkpoint_config, cfg.log_config,
+ cfg.get('momentum_config', None))
+
+ # register profiler hook
+ #trace_config = dict(type='tb_trace', dir_name='work_dir')
+ #profiler_config = dict(on_trace_ready=trace_config)
+ #runner.register_profiler_hook(profiler_config)
+
+ if distributed:
+ if isinstance(runner, EpochBasedRunner):
+ runner.register_hook(DistSamplerSeedHook())
+
+ # register eval hooks
+ if validate:
+ # Support batch_size > 1 in validation
+ val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
+ if val_samples_per_gpu > 1:
+ assert False
+ # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+ cfg.data.val.pipeline = replace_ImageToTensor(
+ cfg.data.val.pipeline)
+ val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+
+ val_dataloader = build_dataloader(
+ val_dataset,
+ samples_per_gpu=val_samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ )
+ eval_cfg = cfg.get('evaluation', {})
+ eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+ eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
+ eval_hook = CustomDistEvalHook if distributed else EvalHook
+ runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
+
+ # user-defined hooks
+ if cfg.get('custom_hooks', None):
+ custom_hooks = cfg.custom_hooks
+ assert isinstance(custom_hooks, list), \
+ f'custom_hooks expect list type, but got {type(custom_hooks)}'
+ for hook_cfg in cfg.custom_hooks:
+ assert isinstance(hook_cfg, dict), \
+ 'Each item in custom_hooks expects dict type, but got ' \
+ f'{type(hook_cfg)}'
+ hook_cfg = hook_cfg.copy()
+ priority = hook_cfg.pop('priority', 'NORMAL')
+ hook = build_from_cfg(hook_cfg, HOOKS)
+ runner.register_hook(hook, priority=priority)
+ if cfg.resume_from:
+ runner.resume(cfg.resume_from)
+ elif cfg.load_from:
+ runner.load_checkpoint(cfg.load_from)
+ runner.run(data_loaders, cfg.workflow)
+
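+# Minimal usage sketch (illustrative; assumes the caller has already built `model` and loads the
+# config via the merged mmcv package, e.g. with `from mmcv import Config`):
+#
+#   cfg = Config.fromfile('adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py')
+#   datasets = [build_dataset(cfg.data.train)]
+#   custom_train_detector(model, datasets, cfg, distributed=False, validate=True)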
diff --git a/adzoo/bevformer/apis/test.py b/adzoo/bevformer/apis/test.py
new file mode 100644
index 0000000..7667395
--- /dev/null
+++ b/adzoo/bevformer/apis/test.py
@@ -0,0 +1,163 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import torch
+import torch.distributed as dist
+from mmcv.image import tensor2imgs
+from mmcv.utils import get_dist_info
+
+from mmcv.core import encode_mask_results
+from mmcv.fileio.io import dump, load
+from mmcv.utils import mkdir_or_exist, ProgressBar
+
+import numpy as np
+import pycocotools.mask as mask_util
+
+def custom_encode_mask_results(mask_results):
+ """Encode bitmap mask to RLE code. Semantic Masks only
+ Args:
+ mask_results (list | tuple[list]): bitmap mask results.
+ In mask scoring rcnn, mask_results is a tuple of (segm_results,
+ segm_cls_score).
+ Returns:
+ list | tuple: RLE encoded mask.
+ """
+ cls_segms = mask_results
+ num_classes = len(cls_segms)
+ encoded_mask_results = []
+ for i in range(len(cls_segms)):
+ encoded_mask_results.append(
+ mask_util.encode(
+ np.array(
+ cls_segms[i][:, :, np.newaxis], order='F',
+ dtype='uint8'))[0]) # encoded with RLE
+ return [encoded_mask_results]
+
+def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+ """Test model with multiple gpus.
+    This method tests the model with multiple gpus and collects the results
+    under two different modes: gpu and cpu. By setting 'gpu_collect=True',
+    it encodes results to gpu tensors and uses gpu communication to collect
+    them. In cpu mode, it saves the results from the different gpus to 'tmpdir'
+    and the rank 0 worker collects them.
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ bbox_results = []
+ mask_results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ prog_bar = ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ have_mask = False
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(data, return_loss=False, rescale=True)
+ # encode mask results
+ if isinstance(result, dict):
+ if 'bbox_results' in result.keys():
+ bbox_result = result['bbox_results']
+ batch_size = len(result['bbox_results'])
+ bbox_results.extend(bbox_result)
+ if 'mask_results' in result.keys() and result['mask_results'] is not None:
+ mask_result = custom_encode_mask_results(result['mask_results'])
+ mask_results.extend(mask_result)
+ have_mask = True
+ else:
+ batch_size = len(result)
+ bbox_results.extend(result)
+
+ #if isinstance(result[0], tuple):
+ # assert False, 'this code is for instance segmentation, which our code will not utilize.'
+ # result = [(bbox_results, encode_mask_results(mask_results))
+ # for bbox_results, mask_results in result]
+ if rank == 0:
+
+ for _ in range(batch_size * world_size):
+ prog_bar.update()
+
+ # collect results from all ranks
+ if gpu_collect:
+ bbox_results = collect_results_gpu(bbox_results, len(dataset))
+ if have_mask:
+ mask_results = collect_results_gpu(mask_results, len(dataset))
+ else:
+ mask_results = None
+ else:
+ bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
+ tmpdir = tmpdir+'_mask' if tmpdir is not None else None
+ if have_mask:
+ mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
+ else:
+ mask_results = None
+
+ if mask_results is None:
+ return bbox_results
+ return {'bbox_results': bbox_results, 'mask_results': mask_results}
+
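+# Usage sketch (assumes torch.distributed is already initialized and `model`/`data_loader` were
+# built by the evaluation entry script; the tmpdir value below is only an example):
+#
+#   outputs = custom_multi_gpu_test(model, data_loader, tmpdir='.dist_test/tmp', gpu_collect=False)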
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
+ MAX_LEN = 512
+ # 32 is whitespace
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_list.append(load(part_file))
+ # sort the results
+ ordered_results = []
+        # Because we change the sampler at evaluation time so that each gpu handles a
+        # contiguous chunk of samples, the parts are concatenated in rank order here
+        # instead of being interleaved (see the commented-out zip below).
+ #for res in zip(*part_list):
+ for res in part_list:
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir)
+ return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+    # Fall back to cpu-based collection; dedicated gpu collection is not implemented here.
+    return collect_results_cpu(result_part, size)
\ No newline at end of file
diff --git a/adzoo/bevformer/apis/train.py b/adzoo/bevformer/apis/train.py
new file mode 100644
index 0000000..dcae402
--- /dev/null
+++ b/adzoo/bevformer/apis/train.py
@@ -0,0 +1,65 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+
+from .mmdet_train import custom_train_detector
+
+def custom_train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ eval_model=None,
+ meta=None):
+ """A function wrapper for launching model training according to cfg.
+
+    We need a different eval_hook in the runner; this wrapper should be
+    deprecated in the future.
+ """
+ if cfg.model.type in ['EncoderDecoder3D']:
+ assert False
+ else:
+ custom_train_detector(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ eval_model=eval_model,
+ meta=meta)
+
+
+def train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ meta=None):
+ """A function wrapper for launching model training according to cfg.
+
+    We need a different eval_hook in the runner; this wrapper should be
+    deprecated in the future.
+ """
+ if cfg.model.type in ['EncoderDecoder3D']:
+ train_segmentor(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ meta=meta)
+ else:
+ train_detector(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ meta=meta)
diff --git a/adzoo/bevformer/configs/_base_/datasets/coco_instance.py b/adzoo/bevformer/configs/_base_/datasets/coco_instance.py
new file mode 100644
index 0000000..f6ea4f4
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/coco_instance.py
@@ -0,0 +1,48 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/instances_train2017.json',
+ img_prefix=data_root + 'train2017/',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/instances_val2017.json',
+ img_prefix=data_root + 'val2017/',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/instances_val2017.json',
+ img_prefix=data_root + 'val2017/',
+ pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/adzoo/bevformer/configs/_base_/datasets/kitti-3d-3class.py b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-3class.py
new file mode 100644
index 0000000..1822af4
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-3class.py
@@ -0,0 +1,140 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'kitti_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(
+ filter_by_difficulty=[-1],
+ filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+ classes=class_names,
+ sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6))
+
+file_client_args = dict(backend='disk')
+# Uncomment the following if using ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='ObjectNoise',
+ num_try=100,
+ translation_std=[1.0, 1.0, 0.5],
+ global_rot_range=[0.0, 0.0],
+ rot_range=[-0.78539816, 0.78539816]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=6,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_train.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=1, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/kitti-3d-car.py b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-car.py
new file mode 100644
index 0000000..1e81226
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-car.py
@@ -0,0 +1,138 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'kitti_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+ classes=class_names,
+ sample_groups=dict(Car=15))
+
+file_client_args = dict(backend='disk')
+# Uncomment the following if using ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='ObjectNoise',
+ num_try=100,
+ translation_std=[1.0, 1.0, 0.5],
+ global_rot_range=[0.0, 0.0],
+ rot_range=[-0.78539816, 0.78539816]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=6,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_train.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=1, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/lyft-3d.py b/adzoo/bevformer/configs/_base_/datasets/lyft-3d.py
new file mode 100644
index 0000000..71baff0
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
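+# range100_lyft-3d.py in this directory is an otherwise-identical variant of
+# this config with a +/-100 m range.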
+# For Lyft we usually do 9-class detection
+class_names = [
+ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+ 'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for the Lyft dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/lyft/': 's3://lyft/lyft/',
+# 'data/lyft/': 's3://lyft/lyft/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_test.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True))
+# For the Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to 24. Please change the interval accordingly if you do not use the
+# default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/nuim_instance.py b/adzoo/bevformer/configs/_base_/datasets/nuim_instance.py
new file mode 100644
index 0000000..82fce56
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/nuim_instance.py
@@ -0,0 +1,59 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/nuimages/'
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
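+# ImageNet mean/std in RGB order; to_rgb=True converts images loaded as BGR
+# to RGB before normalization.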
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+ dict(
+ type='Resize',
+ img_scale=[(1280, 720), (1920, 1080)],
+ multiscale_mode='range',
+ keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1600, 900),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/nuimages_v1.0-train.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/adzoo/bevformer/configs/_base_/datasets/nus-3d.py b/adzoo/bevformer/configs/_base_/datasets/nus-3d.py
new file mode 100644
index 0000000..1548171
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/nus-3d.py
@@ -0,0 +1,142 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the
+# submission format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
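+        # +/-0.3925 rad is approximately +/-pi/8 (22.5 degrees)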
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+# For the nuScenes dataset, we usually evaluate the model at the end of
+# training. Since the models are trained for 24 epochs by default, we set the
+# evaluation interval to 24. Please change the interval accordingly if you do
+# not use the default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/nus-mono3d.py b/adzoo/bevformer/configs/_base_/datasets/nus-mono3d.py
new file mode 100644
index 0000000..1363a94
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/nus-mono3d.py
@@ -0,0 +1,100 @@
+dataset_type = 'CustomNuScenesMonoDataset'
+data_root = 'data/nuscenes/'
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+# Input modality for the nuScenes dataset; this is consistent with the
+# submission format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox=True,
+ with_label=True,
+ with_attr_label=True,
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_bbox_depth=True),
+ dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='Collect3D',
+ keys=[
+ 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
+ 'gt_labels_3d', 'centers2d', 'depths'
+ ]),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='MultiScaleFlipAug',
+ scale_factor=1.0,
+ flip=False,
+ transforms=[
+ dict(type='RandomFlip3D'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['img']),
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['img'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=train_pipeline,
+ modality=input_modality,
+ test_mode=False,
+ box_type_3d='Camera'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='Camera'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='Camera'))
+evaluation = dict(interval=2)
diff --git a/adzoo/bevformer/configs/_base_/datasets/range100_lyft-3d.py b/adzoo/bevformer/configs/_base_/datasets/range100_lyft-3d.py
new file mode 100644
index 0000000..efa63ea
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/range100_lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-100, -100, -5, 100, 100, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+ 'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for the Lyft dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/lyft/': 's3://lyft/lyft/',
+# 'data/lyft/': 's3://lyft/lyft/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_test.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True))
+# For the Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to 24. Please change the interval accordingly if you do not use the
+# default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/s3dis-3d-5class.py b/adzoo/bevformer/configs/_base_/datasets/s3dis-3d-5class.py
new file mode 100644
index 0000000..2422766
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/s3dis-3d-5class.py
@@ -0,0 +1,114 @@
+# dataset settings
+dataset_type = 'S3DISDataset'
+data_root = './data/s3dis/'
+class_names = ('table', 'chair', 'sofa', 'bookcase', 'board')
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+        # following the ScanNet dataset, the rotation range is 5 degrees
+ rot_range=[-0.087266, 0.087266],
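+        # 5 degrees = 5 * pi / 180 ~= 0.087266 rad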
+ scale_ratio_range=[1.0, 1.0],
+ shift_height=True),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type='ConcatDataset',
+ datasets=[
+ dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + f's3dis_infos_Area_{i}.pkl',
+ pipeline=train_pipeline,
+ filter_empty_gt=False,
+ classes=class_names,
+ box_type_3d='Depth') for i in train_area
+ ],
+ separate_eval=False)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/s3dis_seg-3d-13class.py b/adzoo/bevformer/configs/_base_/datasets/s3dis_seg-3d-13class.py
new file mode 100644
index 0000000..39bf556
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/s3dis_seg-3d-13class.py
@@ -0,0 +1,139 @@
+# dataset settings
+dataset_type = 'S3DISSegDataset'
+data_root = './data/s3dis/'
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+ 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+num_points = 4096
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=tuple(range(len(class_names))),
+ max_cat_id=13),
+ dict(
+ type='IndoorPatchPointSample',
+ num_points=num_points,
+ block_size=1.0,
+ ignore_index=len(class_names),
+ use_normalized_coord=True,
+ enlarge_size=0.2,
+ min_unique_num=None),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(
+        # a wrapper needed to successfully call the test function;
+        # we don't actually perform test-time augmentation
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.0,
+ flip_ratio_bev_vertical=0.0),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=tuple(range(len(class_names))),
+ max_cat_id=13),
+ dict(
+ type='DefaultFormatBundle3D',
+ with_label=False,
+ class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ # train on area 1, 2, 3, 4, 6
+ # test on area 5
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_files=[
+ data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area
+ ],
+ pipeline=train_pipeline,
+ classes=class_names,
+ test_mode=False,
+ ignore_index=len(class_names),
+ scene_idxs=[
+ data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy'
+ for i in train_area
+ ]),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names),
+ scene_idxs=data_root +
+ f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names)))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/scannet-3d-18class.py b/adzoo/bevformer/configs/_base_/datasets/scannet-3d-18class.py
new file mode 100644
index 0000000..93da1e5
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/scannet-3d-18class.py
@@ -0,0 +1,128 @@
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+ 'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+ 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+ 'garbagebin')
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_mask_3d=True,
+ with_seg_3d=True),
+ dict(type='GlobalAlignment', rotation_axis=2),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
+ 36, 39),
+ max_cat_id=40),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.087266, 0.087266],
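+        # +/-0.087266 rad corresponds to +/-5 degrees, as in the S3DIS config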
+ scale_ratio_range=[1.0, 1.0],
+ shift_height=True),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='Collect3D',
+ keys=[
+ 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+ 'pts_instance_mask'
+ ])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(type='GlobalAlignment', rotation_axis=2),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(type='GlobalAlignment', rotation_axis=2),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_train.pkl',
+ pipeline=train_pipeline,
+ filter_empty_gt=False,
+ classes=class_names,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='Depth')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/scannet_seg-3d-20class.py b/adzoo/bevformer/configs/_base_/datasets/scannet_seg-3d-20class.py
new file mode 100644
index 0000000..cf73b09
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/scannet_seg-3d-20class.py
@@ -0,0 +1,132 @@
+# dataset settings
+dataset_type = 'ScanNetSegDataset'
+data_root = './data/scannet/'
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+ 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+ 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+ 'bathtub', 'otherfurniture')
+num_points = 8192
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+ 33, 34, 36, 39),
+ max_cat_id=40),
+ dict(
+ type='IndoorPatchPointSample',
+ num_points=num_points,
+ block_size=1.5,
+ ignore_index=len(class_names),
+ use_normalized_coord=False,
+ enlarge_size=0.2,
+ min_unique_num=None),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(
+        # a wrapper needed to successfully call the test function;
+        # we don't actually perform test-time augmentation
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.0,
+ flip_ratio_bev_vertical=0.0),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+ 33, 34, 36, 39),
+ max_cat_id=40),
+ dict(
+ type='DefaultFormatBundle3D',
+ with_label=False,
+ class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ test_mode=False,
+ ignore_index=len(class_names),
+ scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names)),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names)))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/sunrgbd-3d-10class.py b/adzoo/bevformer/configs/_base_/datasets/sunrgbd-3d-10class.py
new file mode 100644
index 0000000..7121b75
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/sunrgbd-3d-10class.py
@@ -0,0 +1,107 @@
+dataset_type = 'SUNRGBDDataset'
+data_root = 'data/sunrgbd/'
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+ 'night_stand', 'bookshelf', 'bathtub')
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(type='LoadAnnotations3D'),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ ),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.523599, 0.523599],
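+        # +/-0.523599 rad corresponds to +/-pi/6 (30 degrees)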
+ scale_ratio_range=[0.85, 1.15],
+ shift_height=True),
+ dict(type='PointSample', num_points=20000),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ ),
+ dict(type='PointSample', num_points=20000),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'sunrgbd_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ filter_empty_gt=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='Depth')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'sunrgbd_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'sunrgbd_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-3class.py b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-3class.py
new file mode 100644
index 0000000..920ac15
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-3class.py
@@ -0,0 +1,145 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'LidarWaymoDataset'
+data_root = 'data/waymo-full/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'waymo_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(
+ filter_by_difficulty=[-1],
+ filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+ classes=class_names,
+ sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+ points_loader=dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=[0, 1, 2, 3, 4],
+ file_client_args=file_client_args))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_train.pkl',
+ split='training',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ # load one frame every five frames
+ load_interval=5)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-car.py b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-car.py
new file mode 100644
index 0000000..02e2627
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-car.py
@@ -0,0 +1,143 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+class_names = ['Car']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'waymo_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+ classes=class_names,
+ sample_groups=dict(Car=15),
+ points_loader=dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=[0, 1, 2, 3, 4],
+ file_client_args=file_client_args))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_train.pkl',
+ split='training',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ # load one frame every five frames
+ load_interval=5)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/bevformer/configs/_base_/default_runtime.py b/adzoo/bevformer/configs/_base_/default_runtime.py
new file mode 100644
index 0000000..4e85b69
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/default_runtime.py
@@ -0,0 +1,18 @@
+checkpoint_config = dict(interval=1)
+# yapf:disable
+# By default we use the text logger and TensorBoard hooks.
+# For more loggers see
+# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
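+# Additional logger hooks could be appended to `hooks` above, e.g. Weights &
+# Biases logging (assuming WandbLoggerHook is available in the bundled mmcv):
+# dict(type='WandbLoggerHook')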
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = None
+load_from = None
+resume_from = None
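+# `load_from` initializes model weights from a checkpoint without restoring
+# training state, while `resume_from` also restores the optimizer and epoch.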
+workflow = [('train', 1)]
diff --git a/adzoo/bevformer/configs/_base_/models/3dssd.py b/adzoo/bevformer/configs/_base_/models/3dssd.py
new file mode 100644
index 0000000..55344c7
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/3dssd.py
@@ -0,0 +1,77 @@
+model = dict(
+ type='SSD3DNet',
+ backbone=dict(
+ type='PointNet2SAMSG',
+ in_channels=4,
+ num_points=(4096, 512, (256, 256)),
+ radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
+ num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)),
+ sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)),
+ ((64, 64, 128), (64, 64, 128), (64, 96, 128)),
+ ((128, 128, 256), (128, 192, 256), (128, 256, 256))),
+ aggregation_channels=(64, 128, 256),
+ fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
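+        # In the 3DSSD paper, D-FPS samples points by Euclidean distance,
+        # F-FPS by feature distance, and FS fuses both strategies.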
+ fps_sample_range_lists=((-1), (-1), (512, -1)),
+ norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+ sa_cfg=dict(
+ type='PointSAModuleMSG',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False)),
+ bbox_head=dict(
+ type='SSD3DHead',
+ in_channels=256,
+ vote_module_cfg=dict(
+ in_channels=256,
+ num_points=256,
+ gt_per_seed=1,
+ conv_channels=(128, ),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+ with_res_feat=False,
+ vote_xyz_range=(3.0, 3.0, 2.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModuleMSG',
+ num_point=256,
+ radii=(4.8, 6.4),
+ sample_nums=(16, 32),
+ mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)),
+ norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+ use_xyz=True,
+ normalize_xyz=False,
+ bias=True),
+ pred_layer_cfg=dict(
+ in_channels=1536,
+ shared_conv_channels=(512, 128),
+ cls_conv_channels=(128, ),
+ reg_conv_channels=(128, ),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+ bias=True),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ loss_weight=1.0),
+ center_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ corner_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(
+ sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05),
+ test_cfg=dict(
+ nms_cfg=dict(type='nms', iou_thr=0.1),
+ sample_mod='spec',
+ score_thr=0.0,
+ per_class_proposal=True,
+ max_output_num=100))
diff --git a/adzoo/bevformer/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/adzoo/bevformer/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000..fb9e0a8
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
@@ -0,0 +1,200 @@
+# model settings
+model = dict(
+ type='CascadeRCNN',
+ pretrained='torchvision://resnet50',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+ roi_head=dict(
+ type='CascadeRoIHead',
+ num_stages=3,
+ stage_loss_weights=[1, 0.5, 0.25],
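+        # The three cascade stages use increasing IoU thresholds (0.5 / 0.6 /
+        # 0.7 in train_cfg below) and progressively tighter bbox target_stds.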
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=[
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+ ],
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(
+ type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,
+ nms_post=2000,
+ max_num=2000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=[
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.6,
+ min_pos_iou=0.6,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.7,
+ min_pos_iou=0.7,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)
+ ]),
+ test_cfg=dict(
+ rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
diff --git a/adzoo/bevformer/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/adzoo/bevformer/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
new file mode 100644
index 0000000..efdce59
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
@@ -0,0 +1,83 @@
+voxel_size = [0.1, 0.1, 0.2]
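+# With the usual +/-51.2 m nuScenes point cloud range, 0.1 m voxels give the
+# 1024 x 1024 BEV grid used in sparse_shape and train_cfg.grid_size below
+# (102.4 / 0.1 = 1024).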
+model = dict(
+ type='CenterPoint',
+ pts_voxel_layer=dict(
+ max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)),
+ pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+ pts_middle_encoder=dict(
+ type='SparseEncoder',
+ in_channels=5,
+ sparse_shape=[41, 1024, 1024],
+ output_channels=128,
+ order=('conv', 'norm', 'act'),
+ encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+ 128)),
+ encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
+ block_type='basicblock'),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=256,
+ out_channels=[128, 256],
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ conv_cfg=dict(type='Conv2d', bias=False)),
+ pts_neck=dict(
+ type='SECONDFPN',
+ in_channels=[128, 256],
+ out_channels=[256, 256],
+ upsample_strides=[1, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ upsample_cfg=dict(type='deconv', bias=False),
+ use_conv_for_no_stride=True),
+ pts_bbox_head=dict(
+ type='CenterHead',
+ in_channels=sum([256, 256]),
+ tasks=[
+ dict(num_class=1, class_names=['car']),
+ dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+ dict(num_class=2, class_names=['bus', 'trailer']),
+ dict(num_class=1, class_names=['barrier']),
+ dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+ dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+ ],
+ common_heads=dict(
+ reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+ share_conv_channel=64,
+ bbox_coder=dict(
+ type='CenterPointBBoxCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_num=500,
+ score_threshold=0.1,
+ out_size_factor=8,
+ voxel_size=voxel_size[:2],
+ code_size=9),
+ separate_head=dict(
+ type='SeparateHead', init_bias=-2.19, final_kernel=3),
+ loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+ loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+ norm_bbox=True),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[1024, 1024, 40],
+ voxel_size=voxel_size,
+ out_size_factor=8,
+ dense_reg=1,
+ gaussian_overlap=0.1,
+ max_objs=500,
+ min_radius=2,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+ test_cfg=dict(
+ pts=dict(
+ post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_per_img=500,
+ max_pool_nms=False,
+ min_radius=[4, 12, 10, 1, 0.85, 0.175],
+ score_threshold=0.1,
+ out_size_factor=8,
+ voxel_size=voxel_size[:2],
+ nms_type='rotate',
+ pre_max_size=1000,
+ post_max_size=83,
+ nms_thr=0.2)))
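+
+# Rough sanity check on the shapes above, assuming the standard nuScenes point
+# cloud range [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] supplied by the dataset
+# config:
+#   x/y: 102.4 m / 0.1 m voxels = 1024 -> grid_size [1024, 1024, 40] and
+#        sparse_shape [41, 1024, 1024] (z is listed first and padded by +1).
+#   z:   8.0 m / 0.2 m voxels = 40.
+#   With out_size_factor=8, the BEV map fed to CenterHead is 1024 / 8 = 128^2.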
diff --git a/adzoo/bevformer/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/adzoo/bevformer/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
new file mode 100644
index 0000000..311d763
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
@@ -0,0 +1,83 @@
+voxel_size = [0.2, 0.2, 8]
+model = dict(
+ type='CenterPoint',
+ pts_voxel_layer=dict(
+ max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)),
+ pts_voxel_encoder=dict(
+ type='PillarFeatureNet',
+ in_channels=5,
+ feat_channels=[64],
+ with_distance=False,
+ voxel_size=(0.2, 0.2, 8),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+ legacy=False),
+ pts_middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ out_channels=[64, 128, 256],
+ layer_nums=[3, 5, 5],
+ layer_strides=[2, 2, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ conv_cfg=dict(type='Conv2d', bias=False)),
+ pts_neck=dict(
+ type='SECONDFPN',
+ in_channels=[64, 128, 256],
+ out_channels=[128, 128, 128],
+ upsample_strides=[0.5, 1, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ upsample_cfg=dict(type='deconv', bias=False),
+ use_conv_for_no_stride=True),
+ pts_bbox_head=dict(
+ type='CenterHead',
+ in_channels=sum([128, 128, 128]),
+ tasks=[
+ dict(num_class=1, class_names=['car']),
+ dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+ dict(num_class=2, class_names=['bus', 'trailer']),
+ dict(num_class=1, class_names=['barrier']),
+ dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+ dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+ ],
+ common_heads=dict(
+ reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+ share_conv_channel=64,
+ bbox_coder=dict(
+ type='CenterPointBBoxCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_num=500,
+ score_threshold=0.1,
+ out_size_factor=4,
+ voxel_size=voxel_size[:2],
+ code_size=9),
+ separate_head=dict(
+ type='SeparateHead', init_bias=-2.19, final_kernel=3),
+ loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+ loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+ norm_bbox=True),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ out_size_factor=4,
+ dense_reg=1,
+ gaussian_overlap=0.1,
+ max_objs=500,
+ min_radius=2,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+ test_cfg=dict(
+ pts=dict(
+ post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_per_img=500,
+ max_pool_nms=False,
+ min_radius=[4, 12, 10, 1, 0.85, 0.175],
+ score_threshold=0.1,
+ pc_range=[-51.2, -51.2],
+ out_size_factor=4,
+ voxel_size=voxel_size[:2],
+ nms_type='rotate',
+ pre_max_size=1000,
+ post_max_size=83,
+ nms_thr=0.2)))
diff --git a/adzoo/bevformer/configs/_base_/models/fcos3d.py b/adzoo/bevformer/configs/_base_/models/fcos3d.py
new file mode 100644
index 0000000..92ea907
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/fcos3d.py
@@ -0,0 +1,74 @@
+model = dict(
+ type='FCOSMono3D',
+ pretrained='open-mmlab://detectron2/resnet101_caffe',
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ start_level=1,
+ add_extra_convs='on_output',
+ num_outs=5,
+ relu_before_extra_convs=True),
+ bbox_head=dict(
+ type='FCOSMono3DHead',
+ num_classes=10,
+ in_channels=256,
+ stacked_convs=2,
+ feat_channels=256,
+ use_direction_classifier=True,
+ diff_rad_by_sin=True,
+ pred_attrs=True,
+ pred_velo=True,
+ dir_offset=0.7854, # pi/4
+ strides=[8, 16, 32, 64, 128],
+ group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo
+ cls_branch=(256, ),
+ reg_branch=(
+ (256, ), # offset
+ (256, ), # depth
+ (256, ), # size
+ (256, ), # rot
+ () # velo
+ ),
+ dir_branch=(256, ),
+ attr_branch=(256, ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_attr=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_centerness=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ norm_on_bbox=True,
+ centerness_on_reg=True,
+ center_sampling=True,
+ conv_bias=True,
+ dcn_on_last_conv=True),
+ train_cfg=dict(
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_thr=0.8,
+ score_thr=0.05,
+ min_bbox_size=0,
+ max_per_img=200))
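+
+# Note: group_reg_dims sums to 2 + 1 + 3 + 1 + 2 = 9, matching the 9 entries of
+# code_weight in train_cfg, i.e. offset (2), depth (1), size (3), rotation (1)
+# and velocity (2) regression targets.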
diff --git a/adzoo/bevformer/configs/_base_/models/groupfree3d.py b/adzoo/bevformer/configs/_base_/models/groupfree3d.py
new file mode 100644
index 0000000..077d049
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/groupfree3d.py
@@ -0,0 +1,71 @@
+model = dict(
+ type='GroupFree3DNet',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=3,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 288)),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True)),
+ bbox_head=dict(
+ type='GroupFree3DHead',
+ in_channels=288,
+ num_decoder_layers=6,
+ num_proposal=256,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='GroupFree3DMHA',
+ embed_dims=288,
+ num_heads=8,
+ attn_drop=0.1,
+ dropout_layer=dict(type='Dropout', drop_prob=0.1)),
+ ffn_cfgs=dict(
+ embed_dims=288,
+ feedforward_channels=2048,
+ ffn_drop=0.1,
+ act_cfg=dict(type='ReLU', inplace=True)),
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ pred_layer_cfg=dict(
+ in_channels=288, shared_conv_channels=(288, 288), bias=True),
+ sampling_objectness_loss=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=8.0),
+ objectness_loss=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ center_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(sample_mod='kps'),
+ test_cfg=dict(
+ sample_mod='kps',
+ nms_thr=0.25,
+ score_thr=0.0,
+ per_class_proposal=True,
+ prediction_stages='last'))
diff --git a/adzoo/bevformer/configs/_base_/models/h3dnet.py b/adzoo/bevformer/configs/_base_/models/h3dnet.py
new file mode 100644
index 0000000..7605667
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/h3dnet.py
@@ -0,0 +1,341 @@
+primitive_z_cfg = dict(
+ type='PrimitiveHead',
+ num_dims=2,
+ num_classes=18,
+ primitive_mode='z',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=1024,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_reg_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ train_cfg=dict(
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+primitive_xy_cfg = dict(
+ type='PrimitiveHead',
+ num_dims=1,
+ num_classes=18,
+ primitive_mode='xy',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=1024,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_reg_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ train_cfg=dict(
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+primitive_line_cfg = dict(
+ type='PrimitiveHead',
+ num_dims=0,
+ num_classes=18,
+ primitive_mode='line',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=1024,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=1.0,
+ loss_dst_weight=1.0),
+ semantic_reg_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=1.0,
+ loss_dst_weight=1.0),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=2.0),
+ train_cfg=dict(
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+model = dict(
+ type='H3DNet',
+ backbone=dict(
+ type='MultiBackbone',
+ num_streams=4,
+ suffixes=['net0', 'net1', 'net2', 'net3'],
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
+ act_cfg=dict(type='ReLU'),
+ backbones=dict(
+ type='PointNet2SASSG',
+ in_channels=4,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 256)),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True))),
+ rpn_head=dict(
+ type='VoteHead',
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=3,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=256,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ pred_layer_cfg=dict(
+ in_channels=128, shared_conv_channels=(128, 128), bias=True),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ roi_head=dict(
+ type='H3DRoIHead',
+ primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg],
+ bbox_head=dict(
+ type='H3DBboxHead',
+ gt_per_seed=3,
+ num_proposal=256,
+ suface_matching_cfg=dict(
+ type='PointSAModule',
+ num_point=256 * 6,
+ radius=0.5,
+ num_sample=32,
+ mlp_channels=[128 + 6, 128, 64, 32],
+ use_xyz=True,
+ normalize_xyz=True),
+ line_matching_cfg=dict(
+ type='PointSAModule',
+ num_point=256 * 12,
+ radius=0.5,
+ num_sample=32,
+ mlp_channels=[128 + 12, 128, 64, 32],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ primitive_refine_channels=[128, 128, 128],
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ line_thresh=0.5,
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ cues_objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.3, 0.7],
+ reduction='mean',
+ loss_weight=5.0),
+ cues_semantic_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.3, 0.7],
+ reduction='mean',
+ loss_weight=5.0),
+ proposal_objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='none',
+ loss_weight=5.0),
+ primitive_center_loss=dict(
+ type='MSELoss', reduction='none', loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
+ rpn_proposal=dict(use_nms=False),
+ rcnn=dict(
+ pos_distance_thr=0.3,
+ neg_distance_thr=0.6,
+ sample_mod='vote',
+ far_threshold=0.6,
+ near_threshold=0.3,
+ mask_surface_threshold=0.3,
+ label_surface_threshold=0.3,
+ mask_line_threshold=0.3,
+ label_line_threshold=0.3)),
+ test_cfg=dict(
+ rpn=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True,
+ use_nms=False),
+ rcnn=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True)))
diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_lyft.py
new file mode 100644
index 0000000..87c7fe0
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_lyft.py
@@ -0,0 +1,22 @@
+_base_ = './hv_pointpillars_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+ pts_voxel_layer=dict(
+ max_num_points=20,
+ point_cloud_range=[-80, -80, -5, 80, 80, 3],
+ max_voxels=(60000, 60000)),
+ pts_voxel_encoder=dict(
+ feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]),
+ pts_middle_encoder=dict(output_shape=[640, 640]),
+ pts_bbox_head=dict(
+ num_classes=9,
+ anchor_generator=dict(
+ ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]),
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+ # model training settings (based on nuScenes model settings)
+ train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_nus.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_nus.py
new file mode 100644
index 0000000..e153f6c
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_nus.py
@@ -0,0 +1,96 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.25, 0.25, 8]
+model = dict(
+ type='MVXFasterRCNN',
+ pts_voxel_layer=dict(
+ max_num_points=64,
+ point_cloud_range=[-50, -50, -5, 50, 50, 3],
+ voxel_size=voxel_size,
+ max_voxels=(30000, 40000)),
+ pts_voxel_encoder=dict(
+ type='HardVFE',
+ in_channels=4,
+ feat_channels=[64, 64],
+ with_distance=False,
+ voxel_size=voxel_size,
+ with_cluster_center=True,
+ with_voxel_center=True,
+ point_cloud_range=[-50, -50, -5, 50, 50, 3],
+ norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+ pts_middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ layer_nums=[3, 5, 5],
+ layer_strides=[2, 2, 2],
+ out_channels=[64, 128, 256]),
+ pts_neck=dict(
+ type='FPN',
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ act_cfg=dict(type='ReLU'),
+ in_channels=[64, 128, 256],
+ out_channels=256,
+ start_level=0,
+ num_outs=3),
+ pts_bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=10,
+ in_channels=256,
+ feat_channels=256,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='AlignedAnchor3DRangeGenerator',
+ ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+ scales=[1, 2, 4],
+ sizes=[
+ [0.8660, 2.5981, 1.], # 1.5/sqrt(3)
+ [0.5774, 1.7321, 1.], # 1/sqrt(3)
+ [1., 1., 1.],
+ [0.4, 0.4, 1],
+ ],
+ custom_values=[0, 0],
+ rotations=[0, 1.57],
+ reshape_out=True),
+ assigner_per_size=False,
+ diff_rad_by_sin=True,
+ dir_offset=0.7854, # pi/4
+ dir_limit_offset=0,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ pts=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_thr=0.2,
+ score_thr=0.05,
+ min_bbox_size=0,
+ max_num=500)))
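+
+# Rough consistency check for the warning above: with point_cloud_range
+# [-50, -50, -5, 50, 50, 3] and voxel_size 0.25, the pillar grid is
+# (50 - (-50)) / 0.25 = 400 cells per side, which matches the
+# PointPillarsScatter output_shape of [400, 400]. If the range or voxel size
+# is changed, output_shape (and the anchor ranges) must be updated to match.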
diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
new file mode 100644
index 0000000..9cd200f
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
@@ -0,0 +1,22 @@
+_base_ = './hv_pointpillars_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+ pts_voxel_layer=dict(
+ max_num_points=20,
+ point_cloud_range=[-100, -100, -5, 100, 100, 3],
+ max_voxels=(60000, 60000)),
+ pts_voxel_encoder=dict(
+ feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]),
+ pts_middle_encoder=dict(output_shape=[800, 800]),
+ pts_bbox_head=dict(
+ num_classes=9,
+ anchor_generator=dict(
+ ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]),
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+ # model training settings (based on nuScenes model settings)
+ train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_kitti.py
new file mode 100644
index 0000000..85076d0
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_kitti.py
@@ -0,0 +1,93 @@
+voxel_size = [0.16, 0.16, 4]
+
+model = dict(
+ type='VoxelNet',
+ voxel_layer=dict(
+ max_num_points=32, # max_points_per_voxel
+ point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
+ voxel_size=voxel_size,
+ max_voxels=(16000, 40000) # (training, testing) max_voxels
+ ),
+ voxel_encoder=dict(
+ type='PillarFeatureNet',
+ in_channels=4,
+ feat_channels=[64],
+ with_distance=False,
+ voxel_size=voxel_size,
+ point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
+ middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+ backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ layer_nums=[3, 5, 5],
+ layer_strides=[2, 2, 2],
+ out_channels=[64, 128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ in_channels=[64, 128, 256],
+ upsample_strides=[1, 2, 4],
+ out_channels=[128, 128, 128]),
+ bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=384,
+ feat_channels=384,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='Anchor3DRangeGenerator',
+ ranges=[
+ [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+ [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+ [0, -39.68, -1.78, 70.4, 39.68, -1.78],
+ ],
+ sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.45,
+ min_pos_iou=0.45,
+ ignore_iof_thr=-1),
+ ],
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_thr=0.01,
+ score_thr=0.1,
+ min_bbox_size=0,
+ nms_pre=100,
+ max_num=50))
diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_waymo.py
new file mode 100644
index 0000000..14873ea
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_waymo.py
@@ -0,0 +1,108 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.32, 0.32, 6]
+model = dict(
+ type='MVXFasterRCNN',
+ pts_voxel_layer=dict(
+ max_num_points=20,
+ point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+ voxel_size=voxel_size,
+ max_voxels=(32000, 32000)),
+ pts_voxel_encoder=dict(
+ type='HardVFE',
+ in_channels=5,
+ feat_channels=[64],
+ with_distance=False,
+ voxel_size=voxel_size,
+ with_cluster_center=True,
+ with_voxel_center=True,
+ point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+ norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+ pts_middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ layer_nums=[3, 5, 5],
+ layer_strides=[1, 2, 2],
+ out_channels=[64, 128, 256]),
+ pts_neck=dict(
+ type='SECONDFPN',
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ in_channels=[64, 128, 256],
+ upsample_strides=[1, 2, 4],
+ out_channels=[128, 128, 128]),
+ pts_bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=384,
+ feat_channels=384,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='AlignedAnchor3DRangeGenerator',
+ ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
+ [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188],
+ [-74.88, -74.88, 0, 74.88, 74.88, 0]],
+ sizes=[
+ [2.08, 4.73, 1.77], # car
+ [0.84, 1.81, 1.77], # cyclist
+ [0.84, 0.91, 1.74] # pedestrian
+ ],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ dir_offset=0.7854, # pi/4
+ dir_limit_offset=0,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ assigner=[
+ dict( # car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.4,
+ min_pos_iou=0.4,
+ ignore_iof_thr=-1),
+ dict( # cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ dict( # pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ ],
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ pts=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=4096,
+ nms_thr=0.25,
+ score_thr=0.1,
+ min_bbox_size=0,
+ max_num=500)))
diff --git a/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_kitti.py b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_kitti.py
new file mode 100644
index 0000000..6bf18ab
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_kitti.py
@@ -0,0 +1,89 @@
+voxel_size = [0.05, 0.05, 0.1]
+
+model = dict(
+ type='VoxelNet',
+ voxel_layer=dict(
+ max_num_points=5,
+ point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+ voxel_size=voxel_size,
+ max_voxels=(16000, 40000)),
+ voxel_encoder=dict(type='HardSimpleVFE'),
+ middle_encoder=dict(
+ type='SparseEncoder',
+ in_channels=4,
+ sparse_shape=[41, 1600, 1408],
+ order=('conv', 'norm', 'act')),
+ backbone=dict(
+ type='SECOND',
+ in_channels=256,
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ out_channels=[128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ in_channels=[128, 256],
+ upsample_strides=[1, 2],
+ out_channels=[256, 256]),
+ bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=512,
+ feat_channels=512,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='Anchor3DRangeGenerator',
+ ranges=[
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+ ],
+ sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.35,
+ neg_iou_thr=0.2,
+ min_pos_iou=0.2,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.35,
+ neg_iou_thr=0.2,
+ min_pos_iou=0.2,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.45,
+ min_pos_iou=0.45,
+ ignore_iof_thr=-1),
+ ],
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_thr=0.01,
+ score_thr=0.1,
+ min_bbox_size=0,
+ nms_pre=100,
+ max_num=50))
diff --git a/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_waymo.py b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_waymo.py
new file mode 100644
index 0000000..eb9bd3a
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_waymo.py
@@ -0,0 +1,100 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.08, 0.08, 0.1]
+model = dict(
+ type='VoxelNet',
+ voxel_layer=dict(
+ max_num_points=10,
+ point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4],
+ voxel_size=voxel_size,
+ max_voxels=(80000, 90000)),
+ voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+ middle_encoder=dict(
+ type='SparseEncoder',
+ in_channels=5,
+ sparse_shape=[61, 1280, 1920],
+ order=('conv', 'norm', 'act')),
+ backbone=dict(
+ type='SECOND',
+ in_channels=384,
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ out_channels=[128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ in_channels=[128, 256],
+ upsample_strides=[1, 2],
+ out_channels=[256, 256]),
+ bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=512,
+ feat_channels=512,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='AlignedAnchor3DRangeGenerator',
+ ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345],
+ [-76.8, -51.2, 0, 76.8, 51.2, 0],
+ [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]],
+ sizes=[
+ [2.08, 4.73, 1.77], # car
+ [0.84, 0.91, 1.74], # pedestrian
+ [0.84, 1.81, 1.77] # cyclist
+ ],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ dir_offset=0.7854, # pi/4
+ dir_limit_offset=0,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=[
+ dict( # car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.4,
+ min_pos_iou=0.4,
+ ignore_iof_thr=-1),
+ dict( # pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ dict( # cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1)
+ ],
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=4096,
+ nms_thr=0.25,
+ score_thr=0.1,
+ min_bbox_size=0,
+ max_num=500))
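+
+# Rough derivation of sparse_shape from the settings above: point_cloud_range
+# [-76.8, -51.2, -2, 76.8, 51.2, 4] with voxel_size [0.08, 0.08, 0.1] gives
+# 153.6 / 0.08 = 1920 voxels in x, 102.4 / 0.08 = 1280 in y and 6 / 0.1 = 60
+# in z; the sparse encoder uses the [z + 1, y, x] ordering, hence
+# sparse_shape = [61, 1280, 1920].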
diff --git a/adzoo/bevformer/configs/_base_/models/imvotenet_image.py b/adzoo/bevformer/configs/_base_/models/imvotenet_image.py
new file mode 100644
index 0000000..981f8bc
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/imvotenet_image.py
@@ -0,0 +1,108 @@
+model = dict(
+ type='ImVoteNet',
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ img_rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ img_roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=10,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+
+ # model training and testing settings
+ train_cfg=dict(
+ img_rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ img_rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,
+ nms_post=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ img_rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ img_rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ img_rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
diff --git a/adzoo/bevformer/configs/_base_/models/mask_rcnn_r50_fpn.py b/adzoo/bevformer/configs/_base_/models/mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000..c5d5e32
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/mask_rcnn_r50_fpn.py
@@ -0,0 +1,124 @@
+# model settings
+model = dict(
+ type='MaskRCNN',
+ pretrained='torchvision://resnet50',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(
+ type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
diff --git a/adzoo/bevformer/configs/_base_/models/paconv_cuda_ssg.py b/adzoo/bevformer/configs/_base_/models/paconv_cuda_ssg.py
new file mode 100644
index 0000000..f513bd4
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/paconv_cuda_ssg.py
@@ -0,0 +1,7 @@
+_base_ = './paconv_ssg.py'
+
+model = dict(
+ backbone=dict(
+ sa_cfg=dict(
+ type='PAConvCUDASAModule',
+ scorenet_cfg=dict(mlp_channels=[8, 16, 16]))))
diff --git a/adzoo/bevformer/configs/_base_/models/paconv_ssg.py b/adzoo/bevformer/configs/_base_/models/paconv_ssg.py
new file mode 100644
index 0000000..1d4f1ed
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/paconv_ssg.py
@@ -0,0 +1,49 @@
+# model settings
+model = dict(
+ type='EncoderDecoder3D',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=9, # [xyz, rgb, normalized_xyz]
+ num_points=(1024, 256, 64, 16),
+ radius=(None, None, None, None), # use kNN instead of ball query
+ num_samples=(32, 32, 32, 32),
+ sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+ 512)),
+ fp_channels=(),
+ norm_cfg=dict(type='BN2d', momentum=0.1),
+ sa_cfg=dict(
+ type='PAConvSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False,
+ paconv_num_kernels=[16, 16, 16],
+ paconv_kernel_input='w_neighbor',
+ scorenet_input='w_neighbor_dist',
+ scorenet_cfg=dict(
+ mlp_channels=[16, 16, 16],
+ score_norm='softmax',
+ temp_factor=1.0,
+ last_bn=False))),
+ decode_head=dict(
+ type='PAConvHead',
+        # PAConv's decoder takes skip connections from the backbone.
+        # Unlike PointNet++, it also concatenates the input features in the
+        # last decoder level, leading to `128 + 6` input channels.
+ fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+ (128 + 6, 128, 128, 128)),
+ channels=128,
+ dropout_ratio=0.5,
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ act_cfg=dict(type='ReLU'),
+ loss_decode=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ class_weight=None, # should be modified with dataset
+ loss_weight=1.0)),
+ # correlation loss to regularize PAConv's kernel weights
+ loss_regularization=dict(
+ type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide'))
diff --git a/adzoo/bevformer/configs/_base_/models/parta2.py b/adzoo/bevformer/configs/_base_/models/parta2.py
new file mode 100644
index 0000000..6c5ae9a
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/parta2.py
@@ -0,0 +1,201 @@
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+ type='PartA2',
+ voxel_layer=dict(
+ max_num_points=5, # max_points_per_voxel
+ point_cloud_range=point_cloud_range,
+ voxel_size=voxel_size,
+ max_voxels=(16000, 40000) # (training, testing) max_voxels
+ ),
+ voxel_encoder=dict(type='HardSimpleVFE'),
+ middle_encoder=dict(
+ type='SparseUNet',
+ in_channels=4,
+ sparse_shape=[41, 1600, 1408],
+ order=('conv', 'norm', 'act')),
+ backbone=dict(
+ type='SECOND',
+ in_channels=256,
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ out_channels=[128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ in_channels=[128, 256],
+ upsample_strides=[1, 2],
+ out_channels=[256, 256]),
+ rpn_head=dict(
+ type='PartA2RPNHead',
+ num_classes=3,
+ in_channels=512,
+ feat_channels=512,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='Anchor3DRangeGenerator',
+ ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+ sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ assigner_per_size=True,
+ assign_per_class=True,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ roi_head=dict(
+ type='PartAggregationROIHead',
+ num_classes=3,
+ semantic_head=dict(
+ type='PointwiseSemanticHead',
+ in_channels=16,
+ extra_width=0.2,
+ seg_score_thr=0.3,
+ num_classes=3,
+ loss_seg=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_part=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+ seg_roi_extractor=dict(
+ type='Single3DRoIAwareExtractor',
+ roi_layer=dict(
+ type='RoIAwarePool3d',
+ out_size=14,
+ max_pts_per_voxel=128,
+ mode='max')),
+ part_roi_extractor=dict(
+ type='Single3DRoIAwareExtractor',
+ roi_layer=dict(
+ type='RoIAwarePool3d',
+ out_size=14,
+ max_pts_per_voxel=128,
+ mode='avg')),
+ bbox_head=dict(
+ type='PartA2BboxHead',
+ num_classes=3,
+ seg_in_channels=16,
+ part_in_channels=4,
+ seg_conv_channels=[64, 64],
+ part_conv_channels=[64, 64],
+ merge_conv_channels=[128, 128],
+ down_conv_channels=[128, 256],
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ shared_fc_channels=[256, 512, 512, 512],
+ cls_channels=[256, 256],
+ reg_channels=[256, 256],
+ dropout_ratio=0.1,
+ roi_feat_size=14,
+ with_corner_loss=True,
+ loss_bbox=dict(
+ type='SmoothL1Loss',
+ beta=1.0 / 9.0,
+ reduction='sum',
+ loss_weight=1.0),
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.45,
+ min_pos_iou=0.45,
+ ignore_iof_thr=-1)
+ ],
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=9000,
+ nms_post=512,
+ max_num=512,
+ nms_thr=0.8,
+ score_thr=0,
+ use_rotate_nms=False),
+ rcnn=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(
+ type='BboxOverlaps3D', coordinate='lidar'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.55,
+ min_pos_iou=0.55,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(
+ type='BboxOverlaps3D', coordinate='lidar'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.55,
+ min_pos_iou=0.55,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(
+ type='BboxOverlaps3D', coordinate='lidar'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.55,
+ min_pos_iou=0.55,
+ ignore_iof_thr=-1)
+ ],
+ sampler=dict(
+ type='IoUNegPiecewiseSampler',
+ num=128,
+ pos_fraction=0.55,
+ neg_piece_fractions=[0.8, 0.2],
+ neg_iou_piece_thrs=[0.55, 0.1],
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False,
+ return_iou=True),
+ cls_pos_thr=0.75,
+ cls_neg_thr=0.25)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1024,
+ nms_post=100,
+ max_num=100,
+ nms_thr=0.7,
+ score_thr=0,
+ use_rotate_nms=True),
+ rcnn=dict(
+ use_rotate_nms=True,
+ use_raw_score=True,
+ nms_thr=0.01,
+ score_thr=0.1)))
diff --git a/adzoo/bevformer/configs/_base_/models/pointnet2_msg.py b/adzoo/bevformer/configs/_base_/models/pointnet2_msg.py
new file mode 100644
index 0000000..222ab88
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/pointnet2_msg.py
@@ -0,0 +1,28 @@
+_base_ = './pointnet2_ssg.py'
+
+# model settings
+model = dict(
+ backbone=dict(
+ _delete_=True,
+ type='PointNet2SAMSG',
+ in_channels=6, # [xyz, rgb], should be modified with dataset
+ num_points=(1024, 256, 64, 16),
+ radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)),
+ num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
+ sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
+ 128)),
+ ((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
+ (256, 384, 512))),
+ aggregation_channels=(None, None, None, None),
+ fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
+ fps_sample_range_lists=((-1), (-1), (-1), (-1)),
+ dilated_group=(False, False, False, False),
+ out_indices=(0, 1, 2, 3),
+ sa_cfg=dict(
+ type='PointSAModuleMSG',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False)),
+ decode_head=dict(
+ fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128),
+ (128, 128, 128, 128))))
diff --git a/adzoo/bevformer/configs/_base_/models/pointnet2_ssg.py b/adzoo/bevformer/configs/_base_/models/pointnet2_ssg.py
new file mode 100644
index 0000000..58b4c24
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/pointnet2_ssg.py
@@ -0,0 +1,35 @@
+# model settings
+model = dict(
+ type='EncoderDecoder3D',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=6, # [xyz, rgb], should be modified with dataset
+ num_points=(1024, 256, 64, 16),
+ radius=(0.1, 0.2, 0.4, 0.8),
+ num_samples=(32, 32, 32, 32),
+ sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+ 512)),
+ fp_channels=(),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False)),
+ decode_head=dict(
+ type='PointNet2Head',
+ fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+ (128, 128, 128, 128)),
+ channels=128,
+ dropout_ratio=0.5,
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ act_cfg=dict(type='ReLU'),
+ loss_decode=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ class_weight=None, # should be modified with dataset
+ loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide'))
diff --git a/adzoo/bevformer/configs/_base_/models/votenet.py b/adzoo/bevformer/configs/_base_/models/votenet.py
new file mode 100644
index 0000000..129339d
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/models/votenet.py
@@ -0,0 +1,73 @@
+model = dict(
+ type='VoteNet',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=4,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 256)),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True)),
+ bbox_head=dict(
+ type='VoteHead',
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=3,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=256,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ pred_layer_cfg=dict(
+ in_channels=128, shared_conv_channels=(128, 128), bias=True),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(
+ pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
+ test_cfg=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True))
diff --git a/adzoo/bevformer/configs/_base_/schedules/cosine.py b/adzoo/bevformer/configs/_base_/schedules/cosine.py
new file mode 100644
index 0000000..69cb7df
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/cosine.py
@@ -0,0 +1,20 @@
+# This schedule is mainly used by models with dynamic voxelization
+# optimizer
+lr = 0.003 # max learning rate
+optimizer = dict(
+ type='AdamW',
+ lr=lr,
+    betas=(0.95, 0.99),  # the momentum is changed during training
+ weight_decay=0.001)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=1000,
+ warmup_ratio=1.0 / 10,
+ min_lr_ratio=1e-5)
+
+momentum_config = None
+
+runner = dict(type='EpochBasedRunner', max_epochs=40)
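+
+# Note: with lr = 0.003 and min_lr_ratio = 1e-5, the cosine schedule anneals
+# the learning rate from 0.003 towards 0.003 * 1e-5 = 3e-8 after the
+# 1000-iteration linear warmup, which starts at lr * warmup_ratio = 3e-4.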
diff --git a/adzoo/bevformer/configs/_base_/schedules/cyclic_20e.py b/adzoo/bevformer/configs/_base_/schedules/cyclic_20e.py
new file mode 100644
index 0000000..704740e
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/cyclic_20e.py
@@ -0,0 +1,24 @@
+# For the nuScenes dataset, we usually evaluate the model at the end of
+# training. Since the models are trained for 20 epochs by default, we set the
+# evaluation interval to 20. Please change the interval accordingly if you do
+# not use a default schedule.
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+ policy='cyclic',
+ target_ratio=(10, 1e-4),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+momentum_config = dict(
+ policy='cyclic',
+ target_ratio=(0.85 / 0.95, 1),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/adzoo/bevformer/configs/_base_/schedules/cyclic_40e.py b/adzoo/bevformer/configs/_base_/schedules/cyclic_40e.py
new file mode 100644
index 0000000..4a711ac
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/cyclic_40e.py
@@ -0,0 +1,31 @@
+# This schedule is usually used by models trained on the KITTI dataset
+
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
+# the learning rate will rise from 0.0018 to 0.018, then decay to 0.0018*1e-4.
+lr = 0.0018
+# The optimizer follows the setting in SECOND.Pytorch, but here we use
+# the official AdamW optimizer implemented by PyTorch.
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
+# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa
+# We implement them in mmcv, for more details, please refer to
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa
+lr_config = dict(
+ policy='cyclic',
+ target_ratio=(10, 1e-4),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+momentum_config = dict(
+ policy='cyclic',
+ target_ratio=(0.85 / 0.95, 1),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+# Although max_epochs is 40, this schedule is usually used with a
+# RepeatDataset of repeat ratio N, so the actual number of training epochs
+# can be N x 40.
+runner = dict(type='EpochBasedRunner', max_epochs=40)
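+
+# Worked example of the trajectory above (single cycle, max_epochs = 40):
+#   peak lr  = lr * target_ratio[0] = 0.0018 * 10   = 0.018
+#   final lr = lr * target_ratio[1] = 0.0018 * 1e-4 = 1.8e-7
+# The rising phase covers step_ratio_up = 40% of the cycle, i.e. roughly the
+# first 16 of the 40 epochs; momentum moves in the opposite direction, dipping
+# from 0.95 to 0.95 * (0.85 / 0.95) = 0.85 while the learning rate rises.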
diff --git a/adzoo/bevformer/configs/_base_/schedules/mmdet_schedule_1x.py b/adzoo/bevformer/configs/_base_/schedules/mmdet_schedule_1x.py
new file mode 100644
index 0000000..13b3783
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/mmdet_schedule_1x.py
@@ -0,0 +1,11 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/adzoo/bevformer/configs/_base_/schedules/schedule_2x.py b/adzoo/bevformer/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000..afde799
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,14 @@
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=1000,
+ warmup_ratio=1.0 / 1000,
+ step=[20, 23])
+momentum_config = None
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/adzoo/bevformer/configs/_base_/schedules/schedule_3x.py b/adzoo/bevformer/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000..115cd26
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used by models on indoor datasets,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008 # max learning rate
+optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+lr_config = dict(policy='step', warmup=None, step=[24, 32])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/adzoo/bevformer/configs/_base_/schedules/seg_cosine_150e.py b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_150e.py
new file mode 100644
index 0000000..04b44e5
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_150e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on the S3DIS dataset for the segmentation task
+optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=150)
diff --git a/adzoo/bevformer/configs/_base_/schedules/seg_cosine_200e.py b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_200e.py
new file mode 100644
index 0000000..6a49484
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_200e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on the ScanNet dataset for the segmentation task
+optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=200)
diff --git a/adzoo/bevformer/configs/_base_/schedules/seg_cosine_50e.py b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_50e.py
new file mode 100644
index 0000000..975a8f9
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_50e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on the S3DIS dataset for the segmentation task
+optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=50)
diff --git a/adzoo/bevformer/configs/bevformer/bevformer_base.py b/adzoo/bevformer/configs/bevformer/bevformer_base.py
new file mode 100644
index 0000000..c67c978
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformer/bevformer_base.py
@@ -0,0 +1,260 @@
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+img_norm_cfg = dict(
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+queue_length = 4 # each sequence contains `queue_length` frames.
+num_cams = 6
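+# Hedged sanity check of the sizes above: the 102.4 m x 102.4 m perception
+# range mapped onto a 200 x 200 BEV grid gives 102.4 / 200 = 0.512 m per BEV
+# cell, which is why the point cloud range and BEV size should be changed
+# together.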
+model = dict(
+ type='BEVFormer',
+ use_grid_mask=True,
+ video_test_mode=True,
+ img_backbone=dict(
+ type='ResNet',
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN2d', requires_grad=False),
+ norm_eval=True,
+ style='caffe',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # the original DCNv2 will print a log message when load_state_dict is performed
+ stage_with_dcn=(False, False, True, True)),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=4,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='BEVFormerPerceptionTransformer',
+ num_cams=num_cams,
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ num_cams=num_cams,
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+anno_root = 'data/infos/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=1,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=anno_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+        # we use box_type_3d='LiDAR' for the KITTI and nuScenes datasets
+        # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
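+# Hedged note: custom_keys are matched by substring against parameter names
+# (mmcv's DefaultOptimizerConstructor), so every img_backbone parameter trains
+# at 2e-4 * 0.1 = 2e-5 while the rest of the model uses the full 2e-4.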
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
+log_config = dict(
+ interval=1,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+
+checkpoint_config = dict(interval=1)
diff --git a/adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py b/adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py
new file mode 100644
index 0000000..f987e1b
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py
@@ -0,0 +1,363 @@
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+
+
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+
+ #===================Construction===========
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
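+# Note on the literal above: Python keeps the last value for duplicated dict
+# keys, so the parked VolkswagenT2 asset (listed under both 'car' and 'van')
+# resolves to 'van', and the repeated "vehicle.ford.mustang" entry is harmless.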
+
+
+
+
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
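+# Hedged reading of eval_cfg: it mirrors the nuScenes detection evaluation,
+# with dist_ths as the center-distance matching thresholds (m) for AP,
+# dist_th_tp for the TP error metrics, mean_ap_weight=5 matching the NDS-style
+# weighting, and class_range presumably the per-class (x, y) evaluation range
+# in meters.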
+
+img_norm_cfg = dict(
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+queue_length = 4 # each sequence contains `queue_length` frames.
+
+model = dict(
+ type='BEVFormer',
+ use_grid_mask=True,
+ video_test_mode=True,
+ img_backbone=dict(
+ type='ResNet',
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN2d', requires_grad=False),
+ norm_eval=True,
+ style='caffe',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # the original DCNv2 will print a log message when load_state_dict is performed
+ stage_with_dcn=(False, False, True, True)),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=4,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=len(class_names),
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='BEVFormerPerceptionTransformer',
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=len(class_names)),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range))))
+
+dataset_type = "B2D_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+file_client_args = dict(backend="disk")
+ann_file_train=info_root + f"/b2d_infos_train.pkl"
+ann_file_val=info_root + f"/b2d_infos_val.pkl"
+ann_file_test=info_root + f"/b2d_infos_val.pkl"
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=6,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ sample_interval=5,
+ name_mapping=NameMapping,
+ eval_cfg=eval_cfg,
+        # we use box_type_3d='LiDAR' for the KITTI and nuScenes datasets
+        # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1,sample_interval=5, name_mapping=NameMapping,eval_cfg=eval_cfg,),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality,sample_interval=5,
+ name_mapping=NameMapping,eval_cfg=eval_cfg,),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
+log_config = dict(
+ interval=1,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+
+checkpoint_config = dict(interval=1)
diff --git a/adzoo/bevformer/configs/bevformer/bevformer_tiny.py b/adzoo/bevformer/configs/bevformer/bevformer_tiny.py
new file mode 100644
index 0000000..78858ee
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformer/bevformer_tiny.py
@@ -0,0 +1,270 @@
+# BEVFormer-tiny consumes at least 6700M of GPU memory.
+# Compared to bevformer_base, bevformer_tiny has:
+# a smaller backbone: R101-DCN -> R50
+# a smaller BEV: 200*200 -> 50*50
+# fewer encoder layers: 6 -> 3
+# a smaller input size: 1600*900 -> 800*450
+# multi-scale features -> single-scale features (C5)
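+# Hedged back-of-the-envelope: shrinking the BEV from 200*200 to 50*50 cuts the
+# number of BEV queries from 40,000 to 2,500 (16x fewer), and the 0.5x
+# RandomScaleImageMultiViewImage step in the pipelines below quarters the
+# number of image tokens per camera.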
+
+
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+bev_h_ = 50
+bev_w_ = 50
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+model = dict(
+ type='BEVFormer',
+ use_grid_mask=True,
+ video_test_mode=True,
+ pretrained=dict(img='torchvision://resnet50'),
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(3,),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformer',
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=3,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+        # we use box_type_3d='LiDAR' for the KITTI and nuScenes datasets
+        # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+
+checkpoint_config = dict(interval=1)
diff --git a/adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py b/adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py
new file mode 100644
index 0000000..d4f92f7
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py
@@ -0,0 +1,360 @@
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+
+
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+
+ #===================Construction===========
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
+
+
+
+
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
+
+img_norm_cfg = dict(
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 100
+bev_w_ = 100
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+model = dict(
+ type='BEVFormer',
+ use_grid_mask=True,
+ video_test_mode=True,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1,2,3),
+ frozen_stages=4,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=len(class_names),
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='BEVFormerPerceptionTransformer',
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=3,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=_dim_//32,
+ dropout=0.0),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=len(class_names)),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range))))
+
+dataset_type = "B2D_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+file_client_args = dict(backend="disk")
+ann_file_train=info_root + f"/b2d_infos_train.pkl"
+ann_file_val=info_root + f"/b2d_infos_val.pkl"
+ann_file_test=info_root + f"/b2d_infos_val.pkl"
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=6,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ sample_interval=5,
+ name_mapping=NameMapping,
+ eval_cfg=eval_cfg,
+        # we use box_type_3d='LiDAR' for the KITTI and nuScenes datasets
+        # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1,sample_interval=5, name_mapping=NameMapping,eval_cfg=eval_cfg,),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality,sample_interval=5,
+ name_mapping=NameMapping,eval_cfg=eval_cfg,),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ by_epoch=False,
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+total_epochs = 1
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+
+checkpoint_config = dict(interval=3000, by_epoch=False)
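+# Hedged note: with by_epoch=False in both lr_config and checkpoint_config,
+# the cosine schedule and checkpointing are driven by iteration count, so a
+# checkpoint is written every 3000 training iterations even though only a
+# single epoch (total_epochs = 1) is run.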
diff --git a/adzoo/bevformer/configs/bevformer_fp16/bevformer_tiny_fp16.py b/adzoo/bevformer/configs/bevformer_fp16/bevformer_tiny_fp16.py
new file mode 100644
index 0000000..aa1e043
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformer_fp16/bevformer_tiny_fp16.py
@@ -0,0 +1,272 @@
+# BEVFormer-tiny consumes at least 6700M of GPU memory.
+# Compared to bevformer_base, bevformer_tiny has:
+# a smaller backbone: R101-DCN -> R50
+# a smaller BEV: 200*200 -> 50*50
+# fewer encoder layers: 6 -> 3
+# a smaller input size: 1600*900 -> 800*450
+# multi-scale features -> single-scale features (C5)
+
+
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+bev_h_ = 50
+bev_w_ = 50
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+model = dict(
+ type='BEVFormer_fp16',
+ use_grid_mask=True,
+ video_test_mode=True,
+ pretrained=dict(img='torchvision://resnet50'),
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(3,),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformer',
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=3,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=8,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+        # we use box_type_3d='LiDAR' for the KITTI and nuScenes datasets
+        # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2.8e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs)
+
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
+custom_hooks = [dict(type='TransferWeight',priority='LOWEST')]
\ No newline at end of file
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
new file mode 100644
index 0000000..594f34b
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-24ep.py
@@ -0,0 +1,360 @@
+# mAP: 0.3805
+# mATE: 0.7198
+# mASE: 0.2805
+# mAOE: 0.4131
+# mAVE: 0.7652
+# mAAE: 0.1951
+# NDS: 0.4529
+_base_ = [
+ '../_base_/default_runtime.py'
+]
+# Dataset
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
+bev_h_ = 200
+bev_w_ = 200
+frames = (0,)
+group_detr = 11
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
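+# Hedged reading of the settings above: frames=(0,) keeps only the current
+# frame (the single-frame "t1" variant), group_detr=11 trains 11 parallel
+# query groups in the Group DETR style (collapsing to one group at inference),
+# and voxel_size works out to 102.4 / 200 = 0.512 m per BEV cell in x and y.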
+ida_aug_conf = {
+ "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": True,
+}
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
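+# Hedged interpretation of the two dicts above (assuming CropResizeFlipImage
+# treats "crop" as an (x1, y1, x2, y2) box): the crop (0, 260, 1600, 900)
+# keeps the lower 1600 x 640 portion of each image, and the "reisze" values
+# appear to be resize targets spanning 0.8 * 640 = 512 to 1.2 * 640 = 768,
+# matching the "(0.8, 1.2)" note; evaluation fixes the resize at 640 and
+# disables random flipping.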
+# file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='GlobalRotScaleTransImage',
+ rot_range=[-22.5, 22.5],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0],
+ reverse_angle=True,
+ training=True,
+ flip_dx_ratio=0.5,
+ flip_dy_ratio=0.5,
+ only_gt=True,),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead_GroupDETR',
+ group_detr=group_detr,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='GroupMultiheadAttention',
+ group=group_detr,
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
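+# With the paramwise_cfg above, the image backbone trains at half the base learning
+# rate (4e-4 * 0.5 = 2e-4); gradients are clipped to an L2 norm of 35.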
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000,
+ warmup_ratio=1.0 / 3,
+ step=[20, ])
+total_epochs = 24
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
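+# Schedule sketch (assuming mmcv's default step gamma of 0.1): the learning rate
+# warms up linearly from lr/3 to 4e-4 over the first 2000 iterations, then decays
+# by 10x at epoch 20 of the 24-epoch run.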
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-48ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-48ep.py
new file mode 100644
index 0000000..a720051
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-48ep.py
@@ -0,0 +1,360 @@
+# mAP: 0.3953
+# mATE: 0.6941
+# mASE: 0.2765
+# mAOE: 0.4199
+# mAVE: 0.7537
+# mAAE: 0.1866
+# NDS: 0.4646
+_base_ = [
+ '../_base_/default_runtime.py'
+]
+# Dataset
+# If the point cloud range is changed, the model's point cloud range should be
+# changed accordingly.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
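+# The per-channel means above are the standard Caffe BGR values with unit std and
+# to_rgb=False, matching the Caffe-style ('caffe') ResNet backbone configured below.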
+bev_h_ = 200
+bev_w_ = 200
+frames = (0,)
+group_detr = 11
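+# Used by BEVFormerHead_GroupDETR below: following Group DETR, the decoder queries
+# are presumably replicated into 11 groups for one-to-many assignment during
+# training, with the extra groups dropped at inference.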
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
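+# BEV grid resolution: point_cloud_range spans 102.4 m in x and y, so each of the
+# 200 x 200 BEV cells is 102.4 / 200 = 0.512 m wide; the single z voxel covers the
+# full 8 m height range.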
+ida_aug_conf = {
+ "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": True,
+}
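+# The training resize targets (512-768) span roughly 0.8x-1.2x of the 640 used at
+# evaluation, and the (0, 260, 1600, 900) crop presumably keeps the bottom
+# 1600 x 640 region of each 1600 x 900 image. The 'reisze' spelling is kept as-is,
+# assuming it matches the key read by the CropResizeFlipImage transform below.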
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+# file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='GlobalRotScaleTransImage',
+ rot_range=[-22.5, 22.5],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0],
+ reverse_angle=True,
+ training=True,
+ flip_dx_ratio=0.5,
+ flip_dy_ratio=0.5,
+ only_gt=True,),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
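+# Presumably a COCO-pretrained FCOS ResNet-50 checkpoint converted to the mmdet
+# key format, used to warm-start the image backbone and FCOS-style heads.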
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead_GroupDETR',
+ group_detr=group_detr,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='GroupMultiheadAttention',
+ group=group_detr,
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000,
+ warmup_ratio=1.0 / 3,
+ step=[44, ])
+total_epochs = 48
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py
new file mode 100644
index 0000000..10330cf
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py
@@ -0,0 +1,349 @@
+# mAP: 0.3512
+# mATE: 0.7534
+# mASE: 0.2863
+# mAOE: 0.4665
+# mAVE: 0.8070
+# mAAE: 0.1861
+# NDS: 0.4257
+
+_base_ = [
+ '../_base_/default_runtime.py'
+]
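+# Compared with the GroupDETR variants in this directory, this 'base' config uses
+# the plain BEVFormerHead with standard MultiheadAttention in the decoder and a
+# lighter training pipeline (fixed 640 resize, no rotation/scaling augmentation,
+# no random flip).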
+# Dataset
+# If the point cloud range is changed, the model's point cloud range should be
+# changed accordingly.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
+bev_h_ = 200
+bev_w_ = 200
+frames = (0,)
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
+ida_aug_conf = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+# file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000,
+ warmup_ratio=1.0 / 3,
+ step=[20, ])
+total_epochs = 24
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py
new file mode 100644
index 0000000..9c6d3cc
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py
@@ -0,0 +1,349 @@
+# mAP: 0.3594
+# mATE: 0.7327
+# mASE: 0.2814
+# mAOE: 0.4074
+# mAVE: 0.7831
+# mAAE: 0.1983
+# NDS: 0.4394
+
+_base_ = [
+ '../_base_/default_runtime.py'
+]
+# Dataset
+# If the point cloud range is changed, the model's point cloud range should be
+# changed accordingly.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
+bev_h_ = 200
+bev_w_ = 200
+frames = (0,)
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
+ida_aug_conf = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+# file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000,
+ warmup_ratio=1.0 / 3,
+ step=[44, ])
+total_epochs = 48
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-24ep.py
new file mode 100644
index 0000000..05bf708
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-24ep.py
@@ -0,0 +1,360 @@
+# mAP: 0.4199
+# mATE: 0.6689
+# mASE: 0.2814
+# mAOE: 0.3915
+# mAVE: 0.3834
+# mAAE: 0.1928
+# NDS: 0.5182
+_base_ = [
+ '../_base_/default_runtime.py'
+]
+# Dataset
+# If the point cloud range is changed, the model's point cloud range should be
+# changed accordingly.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
+bev_h_ = 200
+bev_w_ = 200
+frames = (-1, 0,)
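+# Frame offsets are relative to the current sample: this t2 variant consumes the
+# current frame (0) plus one history frame (-1) for temporal fusion.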
+group_detr = 11
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
+ida_aug_conf = {
+ "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": True,
+}
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+# file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='GlobalRotScaleTransImage',
+ rot_range=[-22.5, 22.5],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0],
+ reverse_angle=True,
+ training=True,
+ flip_dx_ratio=0.5,
+ flip_dy_ratio=0.5,
+ only_gt=True,),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead_GroupDETR',
+ group_detr=group_detr,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='GroupMultiheadAttention',
+ group=group_detr,
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000,
+ warmup_ratio=1.0 / 3,
+ step=[20, ])
+total_epochs = 24
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-48ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-48ep.py
new file mode 100644
index 0000000..2c1dab2
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-48ep.py
@@ -0,0 +1,360 @@
+# mAP: 0.4313
+# mATE: 0.6557
+# mASE: 0.2775
+# mAOE: 0.3851
+# mAVE: 0.3861
+# mAAE: 0.1882
+# NDS: 0.5264
+_base_ = [
+ '../_base_/default_runtime.py'
+]
+# Dataset
+# If the point cloud range is changed, the model's point cloud range should be
+# changed accordingly.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
+bev_h_ = 200
+bev_w_ = 200
+frames = (-1, 0,)
+group_detr = 11
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
+ida_aug_conf = {
+ "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": True,
+}
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+# file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='GlobalRotScaleTransImage',
+ rot_range=[-22.5, 22.5],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0],
+ reverse_angle=True,
+ training=True,
+ flip_dx_ratio=0.5,
+ flip_dy_ratio=0.5,
+ only_gt=True,),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead_GroupDETR',
+ group_detr=group_detr,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='GroupMultiheadAttention',
+ group=group_detr,
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=2000,
+    warmup_ratio=1.0 / 3,
+    step=[44, ])
+total_epochs = 48
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t8-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t8-24ep.py
new file mode 100644
index 0000000..76cca1e
--- /dev/null
+++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t8-24ep.py
@@ -0,0 +1,361 @@
+# mAP: 0.4600
+# mATE: 0.6185
+# mASE: 0.2815
+# mAOE: 0.3660
+# mAVE: 0.3157
+# mAAE: 0.1902
+# NDS: 0.5528
+_base_ = [
+ '../_base_/default_runtime.py'
+]
+# Dataset
+# If the point cloud range is changed, the model's point cloud range should be
+# changed accordingly.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle',
+ 'pedestrian', 'traffic_cone', 'trailer', 'truck'
+]
+dataset_type = 'CustomNuScenesDatasetV2'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False)
+bev_h_ = 200
+bev_w_ = 200
+frames = (-7,-6,-5,-4,-3,-2,-1,0)
+group_detr = 11
+voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8]
+ida_aug_conf = {
+ "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2)
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": True,
+}
+ida_aug_conf_eval = {
+ "reisze": [640, ],
+ "crop": (0, 260, 1600, 900),
+ "H": 900,
+ "W": 1600,
+ "rand_flip": False,
+}
+# file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='GlobalRotScaleTransImage',
+ rot_range=[-22.5, 22.5],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0],
+ reverse_angle=True,
+ training=True,
+ flip_dx_ratio=0.5,
+ flip_dy_ratio=0.5,
+ only_gt=True,),
+ dict(
+ type='ObjectRangeFilter',
+ point_cloud_range=point_cloud_range),
+ dict(
+ type='ObjectNameFilter',
+ classes=class_names),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='CustomCollect3D',
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img',
+ 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation',
+ 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']),
+ dict(type='DD3DMapper',
+ is_train=True,
+ tasks=dict(box2d_on=True, box3d_on=True),)
+]
+eval_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True, ),
+ dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 640),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D',
+ keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation',
+ 'lidar2ego_rotation', 'timestamp'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ persistent_workers=True,
+ train=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ box_type_3d='LiDAR',
+ mono_cfg=dict(
+ name='nusc_trainval',
+ data_root='data/nuscenes/',
+ min_num_lidar_points=3,
+ min_box_visibility=0.2)),
+ val=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1),
+ test=dict(
+ type='CustomNuScenesDatasetV2',
+ frames=frames,
+ data_root='data/nuscenes/',
+ ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+ pipeline=eval_pipeline,
+ classes=class_names,
+ modality=input_modality),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler'))
+evaluation = dict(interval=4, pipeline=eval_pipeline)
+
+# model
+load_from = './ckpts/fcos_r50_coco_2mmdet.pth'
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+_dim_ = 256
+_pos_dim_ = 128
+_ffn_dim_ = 512
+_num_levels_ = 4
+_num_mono_levels_ = 5
+
+model = dict(
+ type='BEVFormerV2',
+ use_grid_mask=True,
+ video_test_mode=False,
+ num_levels=_num_levels_,
+ num_mono_levels=_num_mono_levels_,
+ mono_loss_weight=1.0,
+ frames=frames,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=-1,
+ norm_cfg=dict(type='SyncBN'),
+ norm_eval=False,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_mono_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='BEVFormerHead_GroupDETR',
+ group_detr=group_detr,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ transformer=dict(
+ type='PerceptionTransformerV2',
+ embed_dims=_dim_,
+ frames=frames,
+ inter_channels=_dim_*2,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=4),
+ embed_dims=_dim_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='GroupMultiheadAttention',
+ group=group_detr,
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='NMSFreeCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+ fcos3d_bbox_head=dict(
+ type='NuscenesDD3D',
+ num_classes=10,
+ in_channels=_dim_,
+ strides=[8, 16, 32, 64, 128],
+ box3d_on=True,
+ feature_locations_offset='none',
+ fcos2d_cfg=dict(
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='SyncBN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0),
+ fcos2d_loss_cfg=dict(
+ focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'),
+ fcos3d_cfg=dict(
+ num_convs=4,
+ norm='SyncBN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548],
+ std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]),
+ fcos3d_loss_cfg=dict(
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622],
+ [0.61416006, 1.7016163, 1.3054738],
+ [2.9139307, 10.725025, 3.2832346],
+ [1.9751819, 4.641267, 1.74352],
+ [2.772134, 6.565072, 3.2474296],
+ [0.7800532, 2.138673, 1.4437162],
+ [0.6667362, 0.7181772, 1.7616143],
+ [0.40246472, 0.4027083, 1.0084083],
+ [3.0059454, 12.8197, 4.1213827],
+ [2.4986045, 6.9310856, 2.8382742]]),
+ target_assign_cfg=dict(
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512),
+ (512, 100000000.0))),
+ nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)),
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='SmoothL1Cost', weight=0.75),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=point_cloud_range))))
+
+# optimizer
+optimizer = dict(
+ type='AdamW',
+ lr=4e-4,
+ paramwise_cfg=dict(
+ custom_keys=dict(
+ img_backbone=dict(lr_mult=0.5),
+ )),
+ weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=2000,
+ warmup_ratio=1.0 / 3,
+ step=[20, ])
+total_epochs = 24
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
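
The optimizer and lr_config above amount to a step schedule with linear warmup: the learning rate ramps from lr * warmup_ratio up to 4e-4 over the first 2000 iterations, then drops once at epoch 20. Below is a minimal sketch of that curve, assuming the scheduler's default decay factor gamma=0.1; the iteration/epoch bookkeeping is illustrative, not the actual hook implementation.

def learning_rate(cur_iter, cur_epoch, base_lr=4e-4,
                  warmup_iters=2000, warmup_ratio=1.0 / 3,
                  step=(20,), gamma=0.1):
    # Regular schedule: decay by gamma at each epoch listed in `step`.
    regular_lr = base_lr * gamma ** sum(cur_epoch >= s for s in step)
    if cur_iter >= warmup_iters:
        return regular_lr
    # Linear warmup: ramp from base_lr * warmup_ratio up to the regular lr.
    k = (1 - cur_iter / warmup_iters) * (1 - warmup_ratio)
    return regular_lr * (1 - k)

print(learning_rate(0, 0))        # ~1.33e-4 at the first iteration
print(learning_rate(2000, 0))     # 4e-4 once warmup ends
print(learning_rate(100000, 20))  # 4e-5 after the step at epoch 20
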
diff --git a/adzoo/bevformer/configs/datasets/custom_lyft-3d.py b/adzoo/bevformer/configs/datasets/custom_lyft-3d.py
new file mode 100644
index 0000000..5a95d89
--- /dev/null
+++ b/adzoo/bevformer/configs/datasets/custom_lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+ 'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'CustomLyftDataset'
+data_root = 'data/lyft/'
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or another file client.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/lyft/': 's3://lyft/lyft/',
+# 'data/lyft/': 's3://lyft/lyft/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in the show function;
+# please keep its loading function consistent with test_pipeline (e.g. the file client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True))
+# For the Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to 24. Please change the interval accordingly if you do not
+# use the default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
\ No newline at end of file
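
For reference, the ObjectRangeFilter entry above drops ground-truth boxes that fall outside point_cloud_range. Below is a rough numpy sketch of that BEV-range check, under the assumption of a plain (N, 7) box array with centers in the first three columns; the real transform operates on box objects and filters the labels alongside the boxes.

import numpy as np

def filter_boxes_by_bev_range(gt_boxes, gt_labels, pc_range=(-80, -80, -5, 80, 80, 3)):
    # Keep boxes whose BEV center lies inside [x_min, y_min, x_max, y_max].
    x_min, y_min, _, x_max, y_max, _ = pc_range
    mask = ((gt_boxes[:, 0] >= x_min) & (gt_boxes[:, 0] <= x_max) &
            (gt_boxes[:, 1] >= y_min) & (gt_boxes[:, 1] <= y_max))
    return gt_boxes[mask], gt_labels[mask]

boxes = np.array([[10.0, -5.0, 0.0, 4.0, 2.0, 1.5, 0.0],   # inside the range
                  [95.0,  0.0, 0.0, 4.0, 2.0, 1.5, 0.0]])  # outside, gets dropped
labels = np.array([0, 0])
print(filter_boxes_by_bev_range(boxes, labels)[0].shape)    # (1, 7)
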
diff --git a/adzoo/bevformer/configs/datasets/custom_nus-3d.py b/adzoo/bevformer/configs/datasets/custom_nus-3d.py
new file mode 100644
index 0000000..af81f9b
--- /dev/null
+++ b/adzoo/bevformer/configs/datasets/custom_nus-3d.py
@@ -0,0 +1,141 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset_eval_modified'
+data_root = 'data/nuscenes/'
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or another file client.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in the show function;
+# please keep its loading function consistent with test_pipeline (e.g. the file client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR'),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+# For the nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to 24. Please change the interval accordingly if you do not
+# use the default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
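
The pipeline and dataset entries above are plain dicts whose 'type' key names a registered class. The following is a simplified, illustrative sketch of that registry pattern; the names below are hypothetical, and the actual mmcv registry adds scoping, validation and more.

TRANSFORMS = {}

def register(cls):
    TRANSFORMS[cls.__name__] = cls
    return cls

def build_from_cfg(cfg):
    cfg = dict(cfg)                    # avoid mutating the config dict
    cls = TRANSFORMS[cfg.pop('type')]  # look the class up by its 'type' name
    return cls(**cfg)                  # remaining keys become constructor kwargs

@register
class PointShuffle:
    def __call__(self, results):
        # A real transform would permute results['points'] here.
        return results

pipeline = [build_from_cfg(dict(type='PointShuffle'))]
print(pipeline)  # [<PointShuffle object ...>]
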
diff --git a/adzoo/bevformer/configs/datasets/custom_waymo-3d.py b/adzoo/bevformer/configs/datasets/custom_waymo-3d.py
new file mode 100644
index 0000000..4100e13
--- /dev/null
+++ b/adzoo/bevformer/configs/datasets/custom_waymo-3d.py
@@ -0,0 +1,112 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'CustomWaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or another file client.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+img_norm_cfg = dict(
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=False, use_camera=True)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'waymo_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(
+ filter_by_difficulty=[-1],
+ filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+ classes=class_names,
+ sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+ points_loader=dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=[0, 1, 2, 3, 4],
+ file_client_args=file_client_args))
+
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1920, 1280),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+
+# construct a pipeline for data and gt loading in the show function;
+# please keep its loading function consistent with test_pipeline (e.g. the file client)
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_train.pkl',
+ split='training',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ # load one frame every five frames
+ load_interval=5)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=test_pipeline)
\ No newline at end of file
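
img_norm_cfg above subtracts a per-channel mean and divides by a per-channel std; with to_rgb=False the BGR order produced by cv2-style loading is kept, which is why the mean is listed in BGR order. A small numpy sketch of the same arithmetic (the actual NormalizeMultiviewImage transform applies it to every camera image):

import numpy as np

mean = np.array([103.530, 116.280, 123.675], dtype=np.float32)  # BGR order
std = np.array([1.0, 1.0, 1.0], dtype=np.float32)

def normalize(img):
    # img: (H, W, 3) float32 image in BGR order with values in [0, 255].
    return (img - mean) / std

dummy = np.full((2, 2, 3), 128.0, dtype=np.float32)
print(normalize(dummy)[0, 0])  # [24.47  11.72  4.325]
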
diff --git a/adzoo/bevformer/create_data.py b/adzoo/bevformer/create_data.py
new file mode 100755
index 0000000..f2b0cc1
--- /dev/null
+++ b/adzoo/bevformer/create_data.py
@@ -0,0 +1,305 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+from data_converter.create_gt_database import create_groundtruth_database
+from data_converter import nuscenes_converter as nuscenes_converter
+from data_converter import lyft_converter as lyft_converter
+from data_converter import kitti_converter as kitti
+from data_converter import indoor_converter as indoor
+import argparse
+from os import path as osp
+import sys
+sys.path.append('.')
+
+
+def kitti_data_prep(root_path, info_prefix, version, out_dir):
+ """Prepare data related to Kitti dataset.
+
+ Related data consists of '.pkl' files recording basic infos,
+ 2D annotations and groundtruth database.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ out_dir (str): Output directory of the groundtruth database info.
+ """
+ kitti.create_kitti_info_file(root_path, info_prefix)
+ kitti.create_reduced_point_cloud(root_path, info_prefix)
+
+ info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
+ info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl')
+ info_trainval_path = osp.join(root_path,
+ f'{info_prefix}_infos_trainval.pkl')
+ info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
+ kitti.export_2d_annotation(root_path, info_train_path)
+ kitti.export_2d_annotation(root_path, info_val_path)
+ kitti.export_2d_annotation(root_path, info_trainval_path)
+ kitti.export_2d_annotation(root_path, info_test_path)
+
+ create_groundtruth_database(
+ 'KittiDataset',
+ root_path,
+ info_prefix,
+ f'{out_dir}/{info_prefix}_infos_train.pkl',
+ relative_path=False,
+ mask_anno_path='instances_train.json',
+ with_mask=(version == 'mask'))
+
+
+def nuscenes_data_prep(root_path,
+ can_bus_root_path,
+ info_prefix,
+ version,
+ dataset_name,
+ out_dir,
+ max_sweeps=10):
+ """Prepare data related to nuScenes dataset.
+
+ Related data consists of '.pkl' files recording basic infos,
+ 2D annotations and groundtruth database.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ dataset_name (str): The dataset class name.
+ out_dir (str): Output directory of the groundtruth database info.
+ max_sweeps (int): Number of input consecutive frames. Default: 10
+ """
+ nuscenes_converter.create_nuscenes_infos(
+ root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+ if version == 'v1.0-test':
+ info_test_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_test.pkl')
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_test_path, version=version)
+ else:
+ info_train_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_train.pkl')
+ info_val_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_val.pkl')
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_train_path, version=version)
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_val_path, version=version)
+ # create_groundtruth_database(dataset_name, root_path, info_prefix,
+ # f'{out_dir}/{info_prefix}_infos_train.pkl')
+
+
+def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10):
+ """Prepare data related to Lyft dataset.
+
+ Related data consists of '.pkl' files recording basic infos.
+    Although the ground truth database and 2D annotations are not used for
+    Lyft, they can also be generated in the same way as for nuScenes.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ max_sweeps (int, optional): Number of input consecutive frames.
+ Defaults to 10.
+ """
+ lyft_converter.create_lyft_infos(
+ root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+
+def scannet_data_prep(root_path, info_prefix, out_dir, workers):
+ """Prepare the info file for scannet dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ """
+ indoor.create_indoor_info_file(
+ root_path, info_prefix, out_dir, workers=workers)
+
+
+def s3dis_data_prep(root_path, info_prefix, out_dir, workers):
+ """Prepare the info file for s3dis dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ """
+ indoor.create_indoor_info_file(
+ root_path, info_prefix, out_dir, workers=workers)
+
+
+def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers):
+ """Prepare the info file for sunrgbd dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ """
+ indoor.create_indoor_info_file(
+ root_path, info_prefix, out_dir, workers=workers)
+
+
+def waymo_data_prep(root_path,
+ info_prefix,
+ version,
+ out_dir,
+ workers,
+ max_sweeps=5):
+ """Prepare the info file for waymo dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+        max_sweeps (int): Number of input consecutive frames. Default: 5.
+            Pose information of these frames is stored for later use.
+ """
+    from data_converter import waymo_converter as waymo  # consistent with the imports at the top of this file
+
+ splits = ['training', 'validation', 'testing']
+
+ for i, split in enumerate(splits):
+ load_dir = osp.join(root_path, 'waymo_format', split)
+ if split == 'validation':
+ save_dir = osp.join(out_dir, 'kitti_format', 'training')
+ else:
+ save_dir = osp.join(out_dir, 'kitti_format', split)
+ converter = waymo.Waymo2KITTI(
+ load_dir,
+ save_dir,
+ prefix=str(i),
+ workers=workers,
+            test_mode=(split == 'testing'))  # 'testing' matches the split names above
+ converter.convert()
+ # Generate waymo infos
+ out_dir = osp.join(out_dir, 'kitti_format')
+ kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps)
+
+ create_groundtruth_database(
+ 'WaymoDataset',
+ out_dir,
+ info_prefix,
+ f'{out_dir}/{info_prefix}_infos_train.pkl',
+ relative_path=False,
+ with_mask=False)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+ '--root-path',
+ type=str,
+ default='./data/kitti',
+ help='specify the root path of dataset')
+parser.add_argument(
+ '--canbus',
+ type=str,
+ default='./data',
+ help='specify the root path of nuScenes canbus')
+parser.add_argument(
+ '--version',
+ type=str,
+ default='v1.0',
+ required=False,
+ help='specify the dataset version, no need for kitti')
+parser.add_argument(
+ '--max-sweeps',
+ type=int,
+ default=10,
+ required=False,
+ help='specify sweeps of lidar per example')
+parser.add_argument(
+ '--out-dir',
+ type=str,
+ default='./data/kitti',
+    required=False,
+    help='output directory of the generated info files')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+ '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+ if args.dataset == 'kitti':
+ kitti_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=args.version,
+ out_dir=args.out_dir)
+ elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+ train_version = f'{args.version}-trainval'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ test_version = f'{args.version}-test'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=test_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+ train_version = f'{args.version}'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'lyft':
+ train_version = f'{args.version}-train'
+ lyft_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ max_sweeps=args.max_sweeps)
+ test_version = f'{args.version}-test'
+ lyft_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=test_version,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'waymo':
+ waymo_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=args.version,
+ out_dir=args.out_dir,
+ workers=args.workers,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'scannet':
+ scannet_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ out_dir=args.out_dir,
+ workers=args.workers)
+ elif args.dataset == 's3dis':
+ s3dis_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ out_dir=args.out_dir,
+ workers=args.workers)
+ elif args.dataset == 'sunrgbd':
+ sunrgbd_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ out_dir=args.out_dir,
+ workers=args.workers)
diff --git a/adzoo/bevformer/data_converter/__init__.py b/adzoo/bevformer/data_converter/__init__.py
new file mode 100755
index 0000000..ef101fe
--- /dev/null
+++ b/adzoo/bevformer/data_converter/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/adzoo/bevformer/data_converter/create_gt_database.py b/adzoo/bevformer/data_converter/create_gt_database.py
new file mode 100755
index 0000000..6be53ec
--- /dev/null
+++ b/adzoo/bevformer/data_converter/create_gt_database.py
@@ -0,0 +1,338 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import pickle
+from mmcv import track_iter_progress
+from mmcv.ops import roi_align
+from os import path as osp
+from pycocotools import mask as maskUtils
+from pycocotools.coco import COCO
+
+from mmcv.core.bbox import box_np_ops as box_np_ops
+from mmcv.datasets import build_dataset
+from mmcv.core.evaluation.bbox_overlaps import bbox_overlaps
+
+
+def _poly2mask(mask_ann, img_h, img_w):
+ if isinstance(mask_ann, list):
+ # polygon -- a single object might consist of multiple parts
+ # we merge all parts into one mask rle code
+ rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+ rle = maskUtils.merge(rles)
+ elif isinstance(mask_ann['counts'], list):
+ # uncompressed RLE
+ rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+ else:
+ # rle
+ rle = mask_ann
+ mask = maskUtils.decode(rle)
+ return mask
+
+
+def _parse_coco_ann_info(ann_info):
+ gt_bboxes = []
+ gt_labels = []
+ gt_bboxes_ignore = []
+ gt_masks_ann = []
+
+ for i, ann in enumerate(ann_info):
+ if ann.get('ignore', False):
+ continue
+ x1, y1, w, h = ann['bbox']
+ if ann['area'] <= 0:
+ continue
+ bbox = [x1, y1, x1 + w, y1 + h]
+ if ann.get('iscrowd', False):
+ gt_bboxes_ignore.append(bbox)
+ else:
+ gt_bboxes.append(bbox)
+ gt_masks_ann.append(ann['segmentation'])
+
+ if gt_bboxes:
+ gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+ gt_labels = np.array(gt_labels, dtype=np.int64)
+ else:
+ gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+ gt_labels = np.array([], dtype=np.int64)
+
+ if gt_bboxes_ignore:
+ gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+ else:
+ gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+ ann = dict(
+ bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann)
+
+ return ann
+
+
+def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks):
+ import torch
+ from torch.nn.modules.utils import _pair
+ device = pos_proposals.device
+ num_pos = pos_proposals.size(0)
+ fake_inds = (
+ torch.arange(num_pos,
+ device=device).to(dtype=pos_proposals.dtype)[:, None])
+ rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5
+ mask_size = _pair(28)
+ rois = rois.to(device=device)
+ gt_masks_th = (
+ torch.from_numpy(gt_masks).to(device).index_select(
+ 0, pos_assigned_gt_inds).to(dtype=rois.dtype))
+    # Using RoIAlign can apparently accelerate training (~0.1 s/iter).
+ targets = (
+ roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1))
+ return targets
+
+
+def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img):
+ num_pos = pos_proposals.shape[0]
+ masks = []
+ img_patches = []
+ for i in range(num_pos):
+ gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+ bbox = pos_proposals[i, :].astype(np.int32)
+ x1, y1, x2, y2 = bbox
+ w = np.maximum(x2 - x1 + 1, 1)
+ h = np.maximum(y2 - y1 + 1, 1)
+
+ mask_patch = gt_mask[y1:y1 + h, x1:x1 + w]
+ masked_img = gt_mask[..., None] * org_img
+ img_patch = masked_img[y1:y1 + h, x1:x1 + w]
+
+ img_patches.append(img_patch)
+ masks.append(mask_patch)
+ return img_patches, masks
+
+
+def create_groundtruth_database(dataset_class_name,
+ data_path,
+ info_prefix,
+ info_path=None,
+ mask_anno_path=None,
+ used_classes=None,
+ database_save_path=None,
+ db_info_save_path=None,
+ relative_path=True,
+ add_rgb=False,
+ lidar_only=False,
+ bev_only=False,
+ coors_range=None,
+ with_mask=False):
+ """Given the raw data, generate the ground truth database.
+
+ Args:
+ dataset_class_name (str): Name of the input dataset.
+ data_path (str): Path of the data.
+ info_prefix (str): Prefix of the info file.
+ info_path (str): Path of the info file.
+ Default: None.
+ mask_anno_path (str): Path of the mask_anno.
+ Default: None.
+ used_classes (list[str]): Classes have been used.
+ Default: None.
+ database_save_path (str): Path to save database.
+ Default: None.
+ db_info_save_path (str): Path to save db_info.
+ Default: None.
+ relative_path (bool): Whether to use relative path.
+ Default: True.
+ with_mask (bool): Whether to use mask.
+ Default: False.
+ """
+ print(f'Create GT Database of {dataset_class_name}')
+ dataset_cfg = dict(
+ type=dataset_class_name, data_root=data_path, ann_file=info_path)
+ if dataset_class_name == 'KittiDataset':
+ file_client_args = dict(backend='disk')
+ dataset_cfg.update(
+ test_mode=False,
+ split='training',
+ modality=dict(
+ use_lidar=True,
+ use_depth=False,
+ use_lidar_intensity=True,
+ use_camera=with_mask,
+ ),
+ pipeline=[
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args)
+ ])
+
+ elif dataset_class_name == 'NuScenesDataset':
+ dataset_cfg.update(
+ use_valid_flag=True,
+ pipeline=[
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ use_dim=[0, 1, 2, 3, 4],
+ pad_empty_sweeps=True,
+ remove_close=True),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True)
+ ])
+
+ elif dataset_class_name == 'WaymoDataset':
+ file_client_args = dict(backend='disk')
+ dataset_cfg.update(
+ test_mode=False,
+ split='training',
+ modality=dict(
+ use_lidar=True,
+ use_depth=False,
+ use_lidar_intensity=True,
+ use_camera=False,
+ ),
+ pipeline=[
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args)
+ ])
+
+ dataset = build_dataset(dataset_cfg)
+
+ if database_save_path is None:
+ database_save_path = osp.join(data_path, f'{info_prefix}_gt_database')
+ if db_info_save_path is None:
+ db_info_save_path = osp.join(data_path,
+ f'{info_prefix}_dbinfos_train.pkl')
+ mmcv.mkdir_or_exist(database_save_path)
+ all_db_infos = dict()
+ if with_mask:
+ coco = COCO(osp.join(data_path, mask_anno_path))
+ imgIds = coco.getImgIds()
+ file2id = dict()
+ for i in imgIds:
+ info = coco.loadImgs([i])[0]
+ file2id.update({info['file_name']: i})
+
+ group_counter = 0
+ for j in track_iter_progress(list(range(len(dataset)))):
+ input_dict = dataset.get_data_info(j)
+ dataset.pre_pipeline(input_dict)
+ example = dataset.pipeline(input_dict)
+ annos = example['ann_info']
+ image_idx = example['sample_idx']
+ points = example['points'].tensor.numpy()
+ gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy()
+ names = annos['gt_names']
+ group_dict = dict()
+ if 'group_ids' in annos:
+ group_ids = annos['group_ids']
+ else:
+ group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64)
+ difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32)
+ if 'difficulty' in annos:
+ difficulty = annos['difficulty']
+
+ num_obj = gt_boxes_3d.shape[0]
+ point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d)
+
+ if with_mask:
+ # prepare masks
+ gt_boxes = annos['gt_bboxes']
+ img_path = osp.split(example['img_info']['filename'])[-1]
+            if img_path not in file2id:
+ print(f'skip image {img_path} for empty mask')
+ continue
+ img_id = file2id[img_path]
+ kins_annIds = coco.getAnnIds(imgIds=img_id)
+ kins_raw_info = coco.loadAnns(kins_annIds)
+ kins_ann_info = _parse_coco_ann_info(kins_raw_info)
+ h, w = annos['img_shape'][:2]
+ gt_masks = [
+ _poly2mask(mask, h, w) for mask in kins_ann_info['masks']
+ ]
+ # get mask inds based on iou mapping
+ bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes)
+ mask_inds = bbox_iou.argmax(axis=0)
+ valid_inds = (bbox_iou.max(axis=0) > 0.5)
+
+ # mask the image
+ # use more precise crop when it is ready
+ # object_img_patches = np.ascontiguousarray(
+ # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2))
+ # crop image patches using roi_align
+ # object_img_patches = crop_image_patch_v2(
+ # torch.Tensor(gt_boxes),
+ # torch.Tensor(mask_inds).long(), object_img_patches)
+ object_img_patches, object_masks = crop_image_patch(
+ gt_boxes, gt_masks, mask_inds, annos['img'])
+
+ for i in range(num_obj):
+ filename = f'{image_idx}_{names[i]}_{i}.bin'
+ abs_filepath = osp.join(database_save_path, filename)
+ rel_filepath = osp.join(f'{info_prefix}_gt_database', filename)
+
+ # save point clouds and image patches for each object
+ gt_points = points[point_indices[:, i]]
+ gt_points[:, :3] -= gt_boxes_3d[i, :3]
+
+ if with_mask:
+ if object_masks[i].sum() == 0 or not valid_inds[i]:
+ # Skip object for empty or invalid mask
+ continue
+ img_patch_path = abs_filepath + '.png'
+ mask_patch_path = abs_filepath + '.mask.png'
+ mmcv.imwrite(object_img_patches[i], img_patch_path)
+ mmcv.imwrite(object_masks[i], mask_patch_path)
+
+ with open(abs_filepath, 'w') as f:
+ gt_points.tofile(f)
+
+ if (used_classes is None) or names[i] in used_classes:
+ db_info = {
+ 'name': names[i],
+ 'path': rel_filepath,
+ 'image_idx': image_idx,
+ 'gt_idx': i,
+ 'box3d_lidar': gt_boxes_3d[i],
+ 'num_points_in_gt': gt_points.shape[0],
+ 'difficulty': difficulty[i],
+ }
+ local_group_id = group_ids[i]
+ # if local_group_id >= 0:
+ if local_group_id not in group_dict:
+ group_dict[local_group_id] = group_counter
+ group_counter += 1
+ db_info['group_id'] = group_dict[local_group_id]
+ if 'score' in annos:
+ db_info['score'] = annos['score'][i]
+ if with_mask:
+ db_info.update({'box2d_camera': gt_boxes[i]})
+ if names[i] in all_db_infos:
+ all_db_infos[names[i]].append(db_info)
+ else:
+ all_db_infos[names[i]] = [db_info]
+
+ for k, v in all_db_infos.items():
+ print(f'load {len(v)} {k} database infos')
+
+ with open(db_info_save_path, 'wb') as f:
+ pickle.dump(all_db_infos, f)
diff --git a/adzoo/bevformer/data_converter/indoor_converter.py b/adzoo/bevformer/data_converter/indoor_converter.py
new file mode 100755
index 0000000..0aa5820
--- /dev/null
+++ b/adzoo/bevformer/data_converter/indoor_converter.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+
+from .s3dis_data_utils import S3DISData, S3DISSegData
+from .scannet_data_utils import ScanNetData, ScanNetSegData
+from .sunrgbd_data_utils import SUNRGBDData
+
+
+def create_indoor_info_file(data_path,
+ pkl_prefix='sunrgbd',
+ save_path=None,
+ use_v1=False,
+ workers=4):
+ """Create indoor information file.
+
+ Get information of the raw data and save it to the pkl file.
+
+ Args:
+ data_path (str): Path of the data.
+ pkl_prefix (str): Prefix of the pkl to be saved. Default: 'sunrgbd'.
+ save_path (str): Path of the pkl to be saved. Default: None.
+ use_v1 (bool): Whether to use v1. Default: False.
+ workers (int): Number of threads to be used. Default: 4.
+ """
+ assert os.path.exists(data_path)
+ assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \
+ f'unsupported indoor dataset {pkl_prefix}'
+ save_path = data_path if save_path is None else save_path
+ assert os.path.exists(save_path)
+
+ # generate infos for both detection and segmentation task
+ if pkl_prefix in ['sunrgbd', 'scannet']:
+ train_filename = os.path.join(save_path,
+ f'{pkl_prefix}_infos_train.pkl')
+ val_filename = os.path.join(save_path, f'{pkl_prefix}_infos_val.pkl')
+ if pkl_prefix == 'sunrgbd':
+ # SUN RGB-D has a train-val split
+ train_dataset = SUNRGBDData(
+ root_path=data_path, split='train', use_v1=use_v1)
+ val_dataset = SUNRGBDData(
+ root_path=data_path, split='val', use_v1=use_v1)
+ else:
+ # ScanNet has a train-val-test split
+ train_dataset = ScanNetData(root_path=data_path, split='train')
+ val_dataset = ScanNetData(root_path=data_path, split='val')
+ test_dataset = ScanNetData(root_path=data_path, split='test')
+ test_filename = os.path.join(save_path,
+ f'{pkl_prefix}_infos_test.pkl')
+
+ infos_train = train_dataset.get_infos(
+ num_workers=workers, has_label=True)
+ mmcv.dump(infos_train, train_filename, 'pkl')
+ print(f'{pkl_prefix} info train file is saved to {train_filename}')
+
+ infos_val = val_dataset.get_infos(num_workers=workers, has_label=True)
+ mmcv.dump(infos_val, val_filename, 'pkl')
+ print(f'{pkl_prefix} info val file is saved to {val_filename}')
+
+ if pkl_prefix == 'scannet':
+ infos_test = test_dataset.get_infos(
+ num_workers=workers, has_label=False)
+ mmcv.dump(infos_test, test_filename, 'pkl')
+ print(f'{pkl_prefix} info test file is saved to {test_filename}')
+
+    # generate infos for the semantic segmentation task,
+    # e.g. re-sampled scene indexes and label weights
+    # scene indexes are used to re-sample rooms with different numbers of points
+    # label weights are used to balance classes with different numbers of points
+ if pkl_prefix == 'scannet':
+ # label weight computation function is adopted from
+ # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+ train_dataset = ScanNetSegData(
+ data_root=data_path,
+ ann_file=train_filename,
+ split='train',
+ num_points=8192,
+ label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+ # TODO: do we need to generate on val set?
+ val_dataset = ScanNetSegData(
+ data_root=data_path,
+ ann_file=val_filename,
+ split='val',
+ num_points=8192,
+ label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+ # no need to generate for test set
+ train_dataset.get_seg_infos()
+ val_dataset.get_seg_infos()
+ elif pkl_prefix == 's3dis':
+        # S3DIS doesn't have a fixed train-val split;
+        # it has 6 areas instead, so we generate an info file for each of them.
+        # In training, a dataset wrapper is used to combine different areas.
+ splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]]
+ for split in splits:
+ dataset = S3DISData(root_path=data_path, split=split)
+ info = dataset.get_infos(num_workers=workers, has_label=True)
+ filename = os.path.join(save_path,
+ f'{pkl_prefix}_infos_{split}.pkl')
+ mmcv.dump(info, filename, 'pkl')
+ print(f'{pkl_prefix} info {split} file is saved to {filename}')
+ seg_dataset = S3DISSegData(
+ data_root=data_path,
+ ann_file=filename,
+ split=split,
+ num_points=4096,
+ label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+ seg_dataset.get_seg_infos()
diff --git a/adzoo/bevformer/data_converter/kitti_converter.py b/adzoo/bevformer/data_converter/kitti_converter.py
new file mode 100755
index 0000000..6ac2cef
--- /dev/null
+++ b/adzoo/bevformer/data_converter/kitti_converter.py
@@ -0,0 +1,546 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from collections import OrderedDict
+from nuscenes.utils.geometry_utils import view_points
+from pathlib import Path
+
+from mmcv.core.bbox import box_np_ops
+from .kitti_data_utils import get_kitti_image_info, get_waymo_image_info
+from .nuscenes_converter import post_process_coords
+
+kitti_categories = ('Pedestrian', 'Cyclist', 'Car')
+
+
+def convert_to_kitti_info_version2(info):
+    """Convert KITTI info v1 to v2 if possible.
+
+ Args:
+ info (dict): Info of the input kitti data.
+ - image (dict): image info
+ - calib (dict): calibration info
+ - point_cloud (dict): point cloud info
+ """
+ if 'image' not in info or 'calib' not in info or 'point_cloud' not in info:
+ info['image'] = {
+ 'image_shape': info['img_shape'],
+ 'image_idx': info['image_idx'],
+ 'image_path': info['img_path'],
+ }
+ info['calib'] = {
+ 'R0_rect': info['calib/R0_rect'],
+ 'Tr_velo_to_cam': info['calib/Tr_velo_to_cam'],
+ 'P2': info['calib/P2'],
+ }
+ info['point_cloud'] = {
+ 'velodyne_path': info['velodyne_path'],
+ }
+
+
+def _read_imageset_file(path):
+ with open(path, 'r') as f:
+ lines = f.readlines()
+ return [int(line) for line in lines]
+
+
+def _calculate_num_points_in_gt(data_path,
+ infos,
+ relative_path,
+ remove_outside=True,
+ num_features=4):
+ for info in mmcv.track_iter_progress(infos):
+ pc_info = info['point_cloud']
+ image_info = info['image']
+ calib = info['calib']
+ if relative_path:
+ v_path = str(Path(data_path) / pc_info['velodyne_path'])
+ else:
+ v_path = pc_info['velodyne_path']
+ points_v = np.fromfile(
+ v_path, dtype=np.float32, count=-1).reshape([-1, num_features])
+ rect = calib['R0_rect']
+ Trv2c = calib['Tr_velo_to_cam']
+ P2 = calib['P2']
+ if remove_outside:
+ points_v = box_np_ops.remove_outside_points(
+ points_v, rect, Trv2c, P2, image_info['image_shape'])
+
+ # points_v = points_v[points_v[:, 0] > 0]
+ annos = info['annos']
+ num_obj = len([n for n in annos['name'] if n != 'DontCare'])
+ # annos = kitti.filter_kitti_anno(annos, ['DontCare'])
+ dims = annos['dimensions'][:num_obj]
+ loc = annos['location'][:num_obj]
+ rots = annos['rotation_y'][:num_obj]
+ gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]],
+ axis=1)
+ gt_boxes_lidar = box_np_ops.box_camera_to_lidar(
+ gt_boxes_camera, rect, Trv2c)
+ indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar)
+ num_points_in_gt = indices.sum(0)
+ num_ignored = len(annos['dimensions']) - num_obj
+ num_points_in_gt = np.concatenate(
+ [num_points_in_gt, -np.ones([num_ignored])])
+ annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32)
+
+
+def create_kitti_info_file(data_path,
+ pkl_prefix='kitti',
+ save_path=None,
+ relative_path=True):
+ """Create info file of KITTI dataset.
+
+ Given the raw data, generate its related info file in pkl format.
+
+ Args:
+ data_path (str): Path of the data root.
+ pkl_prefix (str): Prefix of the info file to be generated.
+ save_path (str): Path to save the info file.
+ relative_path (bool): Whether to use relative path.
+ """
+ imageset_folder = Path(data_path) / 'ImageSets'
+ train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
+
+ val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
+ test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
+
+    print('Generate info. This may take several minutes.')
+ if save_path is None:
+ save_path = Path(data_path)
+ else:
+ save_path = Path(save_path)
+ kitti_infos_train = get_kitti_image_info(
+ data_path,
+ training=True,
+ velodyne=True,
+ calib=True,
+ image_ids=train_img_ids,
+ relative_path=relative_path)
+ _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path)
+ filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+ print(f'Kitti info train file is saved to {filename}')
+ mmcv.dump(kitti_infos_train, filename)
+ kitti_infos_val = get_kitti_image_info(
+ data_path,
+ training=True,
+ velodyne=True,
+ calib=True,
+ image_ids=val_img_ids,
+ relative_path=relative_path)
+ _calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path)
+ filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+ print(f'Kitti info val file is saved to {filename}')
+ mmcv.dump(kitti_infos_val, filename)
+ filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
+ print(f'Kitti info trainval file is saved to {filename}')
+ mmcv.dump(kitti_infos_train + kitti_infos_val, filename)
+
+ kitti_infos_test = get_kitti_image_info(
+ data_path,
+ training=False,
+ label_info=False,
+ velodyne=True,
+ calib=True,
+ image_ids=test_img_ids,
+ relative_path=relative_path)
+ filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+ print(f'Kitti info test file is saved to {filename}')
+ mmcv.dump(kitti_infos_test, filename)
+
+
+def create_waymo_info_file(data_path,
+ pkl_prefix='waymo',
+ save_path=None,
+ relative_path=True,
+ max_sweeps=5):
+ """Create info file of waymo dataset.
+
+ Given the raw data, generate its related info file in pkl format.
+
+ Args:
+ data_path (str): Path of the data root.
+ pkl_prefix (str): Prefix of the info file to be generated.
+ save_path (str | None): Path to save the info file.
+ relative_path (bool): Whether to use relative path.
+ max_sweeps (int): Max sweeps before the detection frame to be used.
+ """
+ imageset_folder = Path(data_path) / 'ImageSets'
+ train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
+ # val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
+ # test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
+ train_img_ids = [each for each in train_img_ids if each % 5 == 0]
+    print('Generate info. This may take several minutes.')
+ if save_path is None:
+ save_path = Path(data_path)
+ else:
+ save_path = Path(save_path)
+ waymo_infos_train = get_waymo_image_info(
+ data_path,
+ training=True,
+ velodyne=True,
+ calib=True,
+ pose=True,
+ image_ids=train_img_ids,
+ relative_path=relative_path,
+ max_sweeps=max_sweeps)
+ _calculate_num_points_in_gt(
+ data_path,
+ waymo_infos_train,
+ relative_path,
+ num_features=6,
+ remove_outside=False)
+ filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+ print(f'Waymo info train file is saved to {filename}')
+ mmcv.dump(waymo_infos_train, filename)
+ #
+ # waymo_infos_val = get_waymo_image_info(
+ # data_path,
+ # training=True,
+ # velodyne=True,
+ # calib=True,
+ # pose=True,
+ # image_ids=val_img_ids,
+ # relative_path=relative_path,
+ # max_sweeps=max_sweeps)
+ # _calculate_num_points_in_gt(
+ # data_path,
+ # waymo_infos_val,
+ # relative_path,
+ # num_features=6,
+ # remove_outside=False)
+ # filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+ # print(f'Waymo info val file is saved to {filename}')
+ # mmcv.dump(waymo_infos_val, filename)
+ # filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
+ # print(f'Waymo info trainval file is saved to {filename}')
+ # mmcv.dump(waymo_infos_train + waymo_infos_val, filename)
+ # waymo_infos_test = get_waymo_image_info(
+ # data_path,
+ # training=False,
+ # label_info=False,
+ # velodyne=True,
+ # calib=True,
+ # pose=True,
+ # image_ids=test_img_ids,
+ # relative_path=relative_path,
+ # max_sweeps=max_sweeps)
+ # filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+ # print(f'Waymo info test file is saved to {filename}')
+ # mmcv.dump(waymo_infos_test, filename)
+
+
+def _create_reduced_point_cloud(data_path,
+ info_path,
+ save_path=None,
+ back=False,
+ num_features=4,
+ front_camera_id=2):
+ """Create reduced point clouds for given info.
+
+ Args:
+ data_path (str): Path of original data.
+ info_path (str): Path of data info.
+ save_path (str | None): Path to save reduced point cloud data.
+ Default: None.
+ back (bool): Whether to flip the points to back.
+ num_features (int): Number of point features. Default: 4.
+ front_camera_id (int): The referenced/front camera ID. Default: 2.
+ """
+ kitti_infos = mmcv.load(info_path)
+
+ for info in mmcv.track_iter_progress(kitti_infos):
+ pc_info = info['point_cloud']
+ image_info = info['image']
+ calib = info['calib']
+
+ v_path = pc_info['velodyne_path']
+ v_path = Path(data_path) / v_path
+ points_v = np.fromfile(
+ str(v_path), dtype=np.float32,
+ count=-1).reshape([-1, num_features])
+ rect = calib['R0_rect']
+ if front_camera_id == 2:
+ P2 = calib['P2']
+ else:
+ P2 = calib[f'P{str(front_camera_id)}']
+ Trv2c = calib['Tr_velo_to_cam']
+ # first remove z < 0 points
+ # keep = points_v[:, -1] > 0
+ # points_v = points_v[keep]
+ # then remove outside.
+ if back:
+ points_v[:, 0] = -points_v[:, 0]
+ points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2,
+ image_info['image_shape'])
+ if save_path is None:
+ save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced')
+ if not save_dir.exists():
+ save_dir.mkdir()
+ save_filename = save_dir / v_path.name
+ # save_filename = str(v_path) + '_reduced'
+ if back:
+ save_filename += '_back'
+ else:
+ save_filename = str(Path(save_path) / v_path.name)
+ if back:
+ save_filename += '_back'
+ with open(save_filename, 'w') as f:
+ points_v.tofile(f)
+
+
+def create_reduced_point_cloud(data_path,
+ pkl_prefix,
+ train_info_path=None,
+ val_info_path=None,
+ test_info_path=None,
+ save_path=None,
+ with_back=False):
+ """Create reduced point clouds for training/validation/testing.
+
+ Args:
+ data_path (str): Path of original data.
+ pkl_prefix (str): Prefix of info files.
+ train_info_path (str | None): Path of training set info.
+ Default: None.
+ val_info_path (str | None): Path of validation set info.
+ Default: None.
+ test_info_path (str | None): Path of test set info.
+ Default: None.
+ save_path (str | None): Path to save reduced point cloud data.
+ with_back (bool): Whether to flip the points to back.
+ """
+ if train_info_path is None:
+ train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl'
+ if val_info_path is None:
+ val_info_path = Path(data_path) / f'{pkl_prefix}_infos_val.pkl'
+ if test_info_path is None:
+ test_info_path = Path(data_path) / f'{pkl_prefix}_infos_test.pkl'
+
+ print('create reduced point cloud for training set')
+ _create_reduced_point_cloud(data_path, train_info_path, save_path)
+ print('create reduced point cloud for validation set')
+ _create_reduced_point_cloud(data_path, val_info_path, save_path)
+ print('create reduced point cloud for testing set')
+ _create_reduced_point_cloud(data_path, test_info_path, save_path)
+ if with_back:
+ _create_reduced_point_cloud(
+ data_path, train_info_path, save_path, back=True)
+ _create_reduced_point_cloud(
+ data_path, val_info_path, save_path, back=True)
+ _create_reduced_point_cloud(
+ data_path, test_info_path, save_path, back=True)
+
+
+def export_2d_annotation(root_path, info_path, mono3d=True):
+ """Export 2d annotation from the info file and raw data.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ info_path (str): Path of the info file.
+ mono3d (bool): Whether to export mono3d annotation. Default: True.
+ """
+ # get bbox annotations for camera
+ kitti_infos = mmcv.load(info_path)
+ cat2Ids = [
+ dict(id=kitti_categories.index(cat_name), name=cat_name)
+ for cat_name in kitti_categories
+ ]
+ coco_ann_id = 0
+ coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+ from os import path as osp
+ for info in mmcv.track_iter_progress(kitti_infos):
+ coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d)
+ (height, width,
+ _) = mmcv.imread(osp.join(root_path,
+ info['image']['image_path'])).shape
+ coco_2d_dict['images'].append(
+ dict(
+ file_name=info['image']['image_path'],
+ id=info['image']['image_idx'],
+ Tri2v=info['calib']['Tr_imu_to_velo'],
+ Trv2c=info['calib']['Tr_velo_to_cam'],
+ rect=info['calib']['R0_rect'],
+ cam_intrinsic=info['calib']['P2'],
+ width=width,
+ height=height))
+ for coco_info in coco_infos:
+ if coco_info is None:
+ continue
+ # add an empty key for coco format
+ coco_info['segmentation'] = []
+ coco_info['id'] = coco_ann_id
+ coco_2d_dict['annotations'].append(coco_info)
+ coco_ann_id += 1
+ if mono3d:
+ json_prefix = f'{info_path[:-4]}_mono3d'
+ else:
+ json_prefix = f'{info_path[:-4]}'
+ mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(info, occluded, mono3d=True):
+ """Get the 2D annotation records for a given info.
+
+ Args:
+ info: Information of the given sample data.
+ occluded: Integer (0, 1, 2, 3) indicating occlusion state: \
+ 0 = fully visible, 1 = partly occluded, 2 = largely occluded, \
+ 3 = unknown, -1 = DontCare
+ mono3d (bool): Whether to get boxes with mono3d annotation.
+
+ Return:
+ list[dict]: List of 2D annotation record that belongs to the input
+ `sample_data_token`.
+ """
+ # Get calibration information
+ P2 = info['calib']['P2']
+
+ repro_recs = []
+ # if no annotations in info (test dataset), then return
+ if 'annos' not in info:
+ return repro_recs
+
+ # Get all the annotation with the specified visibilties.
+ ann_dicts = info['annos']
+ mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
+ for k in ann_dicts.keys():
+ ann_dicts[k] = ann_dicts[k][mask]
+
+ # convert dict of list to list of dict
+ ann_recs = []
+ for i in range(len(ann_dicts['occluded'])):
+ ann_rec = {}
+ for k in ann_dicts.keys():
+ ann_rec[k] = ann_dicts[k][i]
+ ann_recs.append(ann_rec)
+
+ for ann_idx, ann_rec in enumerate(ann_recs):
+ # Augment sample_annotation with token information.
+ ann_rec['sample_annotation_token'] = \
+ f"{info['image']['image_idx']}.{ann_idx}"
+ ann_rec['sample_data_token'] = info['image']['image_idx']
+ sample_data_token = info['image']['image_idx']
+
+ loc = ann_rec['location'][np.newaxis, :]
+ dim = ann_rec['dimensions'][np.newaxis, :]
+ rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]
+ # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
+ dst = np.array([0.5, 0.5, 0.5])
+ src = np.array([0.5, 1.0, 0.5])
+ loc = loc + dim * (dst - src)
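+        # KITTI 3D locations are given in the rectified camera-0 frame;
+        # P_i[0, 3] stores -fx * baseline_i, so (P2[0, 3] - P0[0, 3]) / fx is
+        # the metric x-shift that moves the center into the camera-2 frame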
+ offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \
+ / info['calib']['P2'][0, 0]
+ loc_3d = np.copy(loc)
+ loc_3d[0, 0] += offset
+ gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32)
+
+ # Filter out the corners that are not in front of the calibrated
+ # sensor.
+ corners_3d = box_np_ops.center_to_corner_box3d(
+ gt_bbox_3d[:, :3],
+ gt_bbox_3d[:, 3:6],
+ gt_bbox_3d[:, 6], [0.5, 0.5, 0.5],
+ axis=1)
+ corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8)
+ in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+ corners_3d = corners_3d[:, in_front]
+
+ # Project 3d box to 2d.
+ camera_intrinsic = P2
+ corner_coords = view_points(corners_3d, camera_intrinsic,
+ True).T[:, :2].tolist()
+
+ # Keep only corners that fall within the image.
+ final_coords = post_process_coords(corner_coords)
+
+ # Skip if the convex hull of the re-projected corners
+ # does not intersect the image canvas.
+ if final_coords is None:
+ continue
+ else:
+ min_x, min_y, max_x, max_y = final_coords
+
+ # Generate dictionary record to be included in the .json file.
+ repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+ sample_data_token,
+ info['image']['image_path'])
+
+ # If mono3d=True, add 3D annotations in camera coordinates
+ if mono3d and (repro_rec is not None):
+ repro_rec['bbox_cam3d'] = np.concatenate(
+ [loc_3d, dim, rot],
+ axis=1).astype(np.float32).squeeze().tolist()
+ repro_rec['velo_cam3d'] = -1 # no velocity in KITTI
+
+ center3d = np.array(loc).reshape([1, 3])
+ center2d = box_np_ops.points_cam2img(
+ center3d, camera_intrinsic, with_depth=True)
+ repro_rec['center2d'] = center2d.squeeze().tolist()
+ # normalized center2D + depth
+            # samples with depth <= 0 will be removed
+ if repro_rec['center2d'][2] <= 0:
+ continue
+
+ repro_rec['attribute_name'] = -1 # no attribute in KITTI
+ repro_rec['attribute_id'] = -1
+
+ repro_recs.append(repro_rec)
+
+ return repro_recs
+
+
+def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename):
+ """Generate one 2D annotation record given various informations on top of
+ the 2D bounding box coordinates.
+
+ Args:
+ ann_rec (dict): Original 3d annotation record.
+ x1 (float): Minimum value of the x coordinate.
+ y1 (float): Minimum value of the y coordinate.
+ x2 (float): Maximum value of the x coordinate.
+ y2 (float): Maximum value of the y coordinate.
+ sample_data_token (str): Sample data token.
+        filename (str): The corresponding image file where the annotation
+ is present.
+
+ Returns:
+ dict: A sample 2D annotation record.
+            - file_name (str): file name
+ - image_id (str): sample data token
+ - area (float): 2d box area
+ - category_name (str): category name
+ - category_id (int): category id
+ - bbox (list[float]): left x, top y, dx, dy of 2d box
+ - iscrowd (int): whether the area is crowd
+ """
+ repro_rec = OrderedDict()
+ repro_rec['sample_data_token'] = sample_data_token
+ coco_rec = dict()
+
+ key_mapping = {
+ 'name': 'category_name',
+ 'num_points_in_gt': 'num_lidar_pts',
+ 'sample_annotation_token': 'sample_annotation_token',
+ 'sample_data_token': 'sample_data_token',
+ }
+
+ for key, value in ann_rec.items():
+ if key in key_mapping.keys():
+ repro_rec[key_mapping[key]] = value
+
+ repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+ repro_rec['filename'] = filename
+
+ coco_rec['file_name'] = filename
+ coco_rec['image_id'] = sample_data_token
+ coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+ if repro_rec['category_name'] not in kitti_categories:
+ return None
+ cat_name = repro_rec['category_name']
+ coco_rec['category_name'] = cat_name
+ coco_rec['category_id'] = kitti_categories.index(cat_name)
+ coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+ coco_rec['iscrowd'] = 0
+
+ return coco_rec
diff --git a/adzoo/bevformer/data_converter/kitti_data_utils.py b/adzoo/bevformer/data_converter/kitti_data_utils.py
new file mode 100755
index 0000000..01538e0
--- /dev/null
+++ b/adzoo/bevformer/data_converter/kitti_data_utils.py
@@ -0,0 +1,554 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from collections import OrderedDict
+from concurrent import futures as futures
+from os import path as osp
+from pathlib import Path
+from skimage import io
+
+
+def get_image_index_str(img_idx, use_prefix_id=False):
+ if use_prefix_id:
+ return '{:07d}'.format(img_idx)
+ else:
+ return '{:06d}'.format(img_idx)
+
+
+def get_kitti_info_path(idx,
+ prefix,
+ info_type='image_2',
+ file_tail='.png',
+ training=True,
+ relative_path=True,
+ exist_check=True,
+ use_prefix_id=False):
+ img_idx_str = get_image_index_str(idx, use_prefix_id)
+ img_idx_str += file_tail
+ prefix = Path(prefix)
+ if training:
+ file_path = Path('training') / info_type / img_idx_str
+ else:
+ file_path = Path('testing') / info_type / img_idx_str
+ if exist_check and not (prefix / file_path).exists():
+ raise ValueError('file not exist: {}'.format(file_path))
+ if relative_path:
+ return str(file_path)
+ else:
+ return str(prefix / file_path)
+
+
+def get_image_path(idx,
+ prefix,
+ training=True,
+ relative_path=True,
+ exist_check=True,
+ info_type='image_2',
+ use_prefix_id=False):
+ return get_kitti_info_path(idx, prefix, info_type, '.png', training,
+ relative_path, exist_check, use_prefix_id)
+
+
+def get_label_path(idx,
+ prefix,
+ training=True,
+ relative_path=True,
+ exist_check=True,
+ info_type='label_2',
+ use_prefix_id=False):
+ return get_kitti_info_path(idx, prefix, info_type, '.txt', training,
+ relative_path, exist_check, use_prefix_id)
+
+
+def get_velodyne_path(idx,
+ prefix,
+ training=True,
+ relative_path=True,
+ exist_check=True,
+ use_prefix_id=False):
+ return get_kitti_info_path(idx, prefix, 'velodyne', '.bin', training,
+ relative_path, exist_check, use_prefix_id)
+
+
+def get_calib_path(idx,
+ prefix,
+ training=True,
+ relative_path=True,
+ exist_check=True,
+ use_prefix_id=False):
+ return get_kitti_info_path(idx, prefix, 'calib', '.txt', training,
+ relative_path, exist_check, use_prefix_id)
+
+
+def get_pose_path(idx,
+ prefix,
+ training=True,
+ relative_path=True,
+ exist_check=True,
+ use_prefix_id=False):
+ return get_kitti_info_path(idx, prefix, 'pose', '.txt', training,
+ relative_path, exist_check, use_prefix_id)
+
+
+def get_label_anno(label_path):
+ annotations = {}
+ annotations.update({
+ 'name': [],
+ 'truncated': [],
+ 'occluded': [],
+ 'alpha': [],
+ 'bbox': [],
+ 'dimensions': [],
+ 'location': [],
+ 'rotation_y': []
+ })
+ with open(label_path, 'r') as f:
+ lines = f.readlines()
+ # if len(lines) == 0 or len(lines[0]) < 15:
+ # content = []
+ # else:
+ content = [line.strip().split(' ') for line in lines]
+ num_objects = len([x[0] for x in content if x[0] != 'DontCare'])
+ annotations['name'] = np.array([x[0] for x in content])
+ num_gt = len(annotations['name'])
+ annotations['truncated'] = np.array([float(x[1]) for x in content])
+ annotations['occluded'] = np.array([int(x[2]) for x in content])
+ annotations['alpha'] = np.array([float(x[3]) for x in content])
+ annotations['bbox'] = np.array([[float(info) for info in x[4:8]]
+ for x in content]).reshape(-1, 4)
+ # dimensions will convert hwl format to standard lhw(camera) format.
+ annotations['dimensions'] = np.array([[float(info) for info in x[8:11]]
+ for x in content
+ ]).reshape(-1, 3)[:, [2, 0, 1]]
+ annotations['location'] = np.array([[float(info) for info in x[11:14]]
+ for x in content]).reshape(-1, 3)
+ annotations['rotation_y'] = np.array([float(x[14])
+ for x in content]).reshape(-1)
+ if len(content) != 0 and len(content[0]) == 16: # have score
+ annotations['score'] = np.array([float(x[15]) for x in content])
+ else:
+ annotations['score'] = np.zeros((annotations['bbox'].shape[0], ))
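+    # non-DontCare objects keep their ordinal index; DontCare entries
+    # (assumed to come after the real objects in the label file) get -1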
+ index = list(range(num_objects)) + [-1] * (num_gt - num_objects)
+ annotations['index'] = np.array(index, dtype=np.int32)
+ annotations['group_ids'] = np.arange(num_gt, dtype=np.int32)
+ return annotations
+
+
+def _extend_matrix(mat):
+ mat = np.concatenate([mat, np.array([[0., 0., 0., 1.]])], axis=0)
+ return mat
+
+
+def get_kitti_image_info(path,
+ training=True,
+ label_info=True,
+ velodyne=False,
+ calib=False,
+ image_ids=7481,
+ extend_matrix=True,
+ num_worker=8,
+ relative_path=True,
+ with_imageshape=True):
+ """
+ KITTI annotation format version 2:
+ {
+ [optional]points: [N, 3+] point cloud
+ [optional, for kitti]image: {
+ image_idx: ...
+ image_path: ...
+ image_shape: ...
+ }
+ point_cloud: {
+ num_features: 4
+ velodyne_path: ...
+ }
+ [optional, for kitti]calib: {
+ R0_rect: ...
+ Tr_velo_to_cam: ...
+ P2: ...
+ }
+ annos: {
+ location: [num_gt, 3] array
+ dimensions: [num_gt, 3] array
+ rotation_y: [num_gt] angle array
+ name: [num_gt] ground truth name array
+ [optional]difficulty: kitti difficulty
+ [optional]group_ids: used for multi-part object
+ }
+ }
+ """
+ root_path = Path(path)
+ if not isinstance(image_ids, list):
+ image_ids = list(range(image_ids))
+
+ def map_func(idx):
+ info = {}
+ pc_info = {'num_features': 4}
+ calib_info = {}
+
+ image_info = {'image_idx': idx}
+ annotations = None
+ if velodyne:
+ pc_info['velodyne_path'] = get_velodyne_path(
+ idx, path, training, relative_path)
+ image_info['image_path'] = get_image_path(idx, path, training,
+ relative_path)
+ if with_imageshape:
+ img_path = image_info['image_path']
+ if relative_path:
+ img_path = str(root_path / img_path)
+ image_info['image_shape'] = np.array(
+ io.imread(img_path).shape[:2], dtype=np.int32)
+ if label_info:
+ label_path = get_label_path(idx, path, training, relative_path)
+ if relative_path:
+ label_path = str(root_path / label_path)
+ annotations = get_label_anno(label_path)
+ info['image'] = image_info
+ info['point_cloud'] = pc_info
+ if calib:
+ calib_path = get_calib_path(
+ idx, path, training, relative_path=False)
+ with open(calib_path, 'r') as f:
+ lines = f.readlines()
+ P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
+ ]).reshape([3, 4])
+ if extend_matrix:
+ P0 = _extend_matrix(P0)
+ P1 = _extend_matrix(P1)
+ P2 = _extend_matrix(P2)
+ P3 = _extend_matrix(P3)
+ R0_rect = np.array([
+ float(info) for info in lines[4].split(' ')[1:10]
+ ]).reshape([3, 3])
+ if extend_matrix:
+ rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+ rect_4x4[3, 3] = 1.
+ rect_4x4[:3, :3] = R0_rect
+ else:
+ rect_4x4 = R0_rect
+
+ Tr_velo_to_cam = np.array([
+ float(info) for info in lines[5].split(' ')[1:13]
+ ]).reshape([3, 4])
+ Tr_imu_to_velo = np.array([
+ float(info) for info in lines[6].split(' ')[1:13]
+ ]).reshape([3, 4])
+ if extend_matrix:
+ Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+ Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
+ calib_info['P0'] = P0
+ calib_info['P1'] = P1
+ calib_info['P2'] = P2
+ calib_info['P3'] = P3
+ calib_info['R0_rect'] = rect_4x4
+ calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
+ calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo
+ info['calib'] = calib_info
+
+ if annotations is not None:
+ info['annos'] = annotations
+ add_difficulty_to_annos(info)
+ return info
+
+ with futures.ThreadPoolExecutor(num_worker) as executor:
+ image_infos = executor.map(map_func, image_ids)
+
+ return list(image_infos)
+
+
+def get_waymo_image_info(path,
+ training=True,
+ label_info=True,
+ velodyne=False,
+ calib=False,
+ pose=False,
+ image_ids=7481,
+ extend_matrix=True,
+ num_worker=8,
+ relative_path=True,
+ with_imageshape=True,
+ max_sweeps=5):
+ """
+    Waymo annotation format, following the KITTI layout:
+ {
+ [optional]points: [N, 3+] point cloud
+ [optional, for kitti]image: {
+ image_idx: ...
+ image_path: ...
+ image_shape: ...
+ }
+ point_cloud: {
+ num_features: 6
+ velodyne_path: ...
+ }
+ [optional, for kitti]calib: {
+ R0_rect: ...
+ Tr_velo_to_cam0: ...
+ P0: ...
+ }
+ annos: {
+ location: [num_gt, 3] array
+ dimensions: [num_gt, 3] array
+ rotation_y: [num_gt] angle array
+ name: [num_gt] ground truth name array
+ [optional]difficulty: kitti difficulty
+ [optional]group_ids: used for multi-part object
+ }
+ }
+ """
+ root_path = Path(path)
+ if not isinstance(image_ids, list):
+ image_ids = list(range(image_ids))
+
+ def map_func(idx):
+ info = {}
+ pc_info = {'num_features': 6}
+ calib_info = {}
+
+ image_info = {'image_idx': idx}
+ annotations = None
+ if velodyne:
+ pc_info['velodyne_path'] = get_velodyne_path(
+ idx, path, training, relative_path, use_prefix_id=True)
+ points = np.fromfile(
+ Path(path) / pc_info['velodyne_path'], dtype=np.float32)
+ points = np.copy(points).reshape(-1, pc_info['num_features'])
+ info['timestamp'] = np.int64(points[0, -1])
+ # values of the last dim are all the timestamp
+ image_info['image_path'] = get_image_path(
+ idx,
+ path,
+ training,
+ relative_path,
+ info_type='image_0',
+ use_prefix_id=True)
+ if with_imageshape:
+ img_path = image_info['image_path']
+ if relative_path:
+ img_path = str(root_path / img_path)
+ image_info['image_shape'] = np.array(
+ io.imread(img_path).shape[:2], dtype=np.int32)
+ if label_info:
+ label_path = get_label_path(
+ idx,
+ path,
+ training,
+ relative_path,
+ info_type='label_all',
+ use_prefix_id=True)
+ if relative_path:
+ label_path = str(root_path / label_path)
+ annotations = get_label_anno(label_path)
+ info['image'] = image_info
+ info['point_cloud'] = pc_info
+ if calib:
+ calib_path = get_calib_path(
+ idx, path, training, relative_path=False, use_prefix_id=True)
+ with open(calib_path, 'r') as f:
+ lines = f.readlines()
+ P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
+ ]).reshape([3, 4])
+ P4 = np.array([float(info) for info in lines[4].split(' ')[1:13]
+ ]).reshape([3, 4])
+ if extend_matrix:
+ P0 = _extend_matrix(P0)
+ P1 = _extend_matrix(P1)
+ P2 = _extend_matrix(P2)
+ P3 = _extend_matrix(P3)
+ P4 = _extend_matrix(P4)
+ R0_rect = np.array([
+ float(info) for info in lines[5].split(' ')[1:10]
+ ]).reshape([3, 3])
+ if extend_matrix:
+ rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+ rect_4x4[3, 3] = 1.
+ rect_4x4[:3, :3] = R0_rect
+ else:
+ rect_4x4 = R0_rect
+
+ Tr_velo_to_cam = np.array([
+ float(info) for info in lines[6].split(' ')[1:13]
+ ]).reshape([3, 4])
+ if extend_matrix:
+ Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+ calib_info['P0'] = P0
+ calib_info['P1'] = P1
+ calib_info['P2'] = P2
+ calib_info['P3'] = P3
+ calib_info['P4'] = P4
+ calib_info['R0_rect'] = rect_4x4
+ calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
+ info['calib'] = calib_info
+ if pose:
+ pose_path = get_pose_path(
+ idx, path, training, relative_path=False, use_prefix_id=True)
+ info['pose'] = np.loadtxt(pose_path)
+
+ if annotations is not None:
+ info['annos'] = annotations
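+            # the converted Waymo labels reuse the KITTI score slot to store
+            # the camera id of each box, hence the rename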
+ info['annos']['camera_id'] = info['annos'].pop('score')
+ add_difficulty_to_annos(info)
+
+ sweeps = []
+ prev_idx = idx
+ while len(sweeps) < max_sweeps:
+ prev_info = {}
+ prev_idx -= 1
+ prev_info['velodyne_path'] = get_velodyne_path(
+ prev_idx,
+ path,
+ training,
+ relative_path,
+ exist_check=False,
+ use_prefix_id=True)
+ if_prev_exists = osp.exists(
+ Path(path) / prev_info['velodyne_path'])
+ if if_prev_exists:
+ prev_points = np.fromfile(
+ Path(path) / prev_info['velodyne_path'], dtype=np.float32)
+ prev_points = np.copy(prev_points).reshape(
+ -1, pc_info['num_features'])
+ prev_info['timestamp'] = np.int64(prev_points[0, -1])
+ prev_pose_path = get_pose_path(
+ prev_idx,
+ path,
+ training,
+ relative_path=False,
+ use_prefix_id=True)
+ prev_info['pose'] = np.loadtxt(prev_pose_path)
+ sweeps.append(prev_info)
+ else:
+ break
+ info['sweeps'] = sweeps
+
+ return info
+
+ with futures.ThreadPoolExecutor(num_worker) as executor:
+ image_infos = executor.map(map_func, image_ids)
+
+ return list(image_infos)
+
+
+def kitti_anno_to_label_file(annos, folder):
+ folder = Path(folder)
+ for anno in annos:
+ image_idx = anno['metadata']['image_idx']
+ label_lines = []
+ for j in range(anno['bbox'].shape[0]):
+ label_dict = {
+ 'name': anno['name'][j],
+ 'alpha': anno['alpha'][j],
+ 'bbox': anno['bbox'][j],
+ 'location': anno['location'][j],
+ 'dimensions': anno['dimensions'][j],
+ 'rotation_y': anno['rotation_y'][j],
+ 'score': anno['score'][j],
+ }
+ label_line = kitti_result_line(label_dict)
+ label_lines.append(label_line)
+ label_file = folder / f'{get_image_index_str(image_idx)}.txt'
+ label_str = '\n'.join(label_lines)
+ with open(label_file, 'w') as f:
+ f.write(label_str)
+
+
+def add_difficulty_to_annos(info):
+ min_height = [40, 25,
+ 25] # minimum height for evaluated groundtruth/detections
+ max_occlusion = [
+ 0, 1, 2
+ ] # maximum occlusion level of the groundtruth used for evaluation
+ max_trunc = [
+ 0.15, 0.3, 0.5
+ ] # maximum truncation level of the groundtruth used for evaluation
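+    # the three entries above correspond to the KITTI easy / moderate / hard
+    # evaluation regimes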
+ annos = info['annos']
+ dims = annos['dimensions'] # lhw format
+ bbox = annos['bbox']
+ height = bbox[:, 3] - bbox[:, 1]
+ occlusion = annos['occluded']
+ truncation = annos['truncated']
+ diff = []
+    easy_mask = np.ones((len(dims), ), dtype=bool)
+    moderate_mask = np.ones((len(dims), ), dtype=bool)
+    hard_mask = np.ones((len(dims), ), dtype=bool)
+ i = 0
+ for h, o, t in zip(height, occlusion, truncation):
+ if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]:
+ easy_mask[i] = False
+ if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]:
+ moderate_mask[i] = False
+ if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]:
+ hard_mask[i] = False
+ i += 1
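+    # label each box with the easiest regime whose thresholds it satisfies;
+    # the xor removes boxes already claimed by an easier regime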
+ is_easy = easy_mask
+ is_moderate = np.logical_xor(easy_mask, moderate_mask)
+ is_hard = np.logical_xor(hard_mask, moderate_mask)
+
+ for i in range(len(dims)):
+ if is_easy[i]:
+ diff.append(0)
+ elif is_moderate[i]:
+ diff.append(1)
+ elif is_hard[i]:
+ diff.append(2)
+ else:
+ diff.append(-1)
+ annos['difficulty'] = np.array(diff, np.int32)
+ return diff
+
+
+def kitti_result_line(result_dict, precision=4):
+ prec_float = '{' + ':.{}f'.format(precision) + '}'
+ res_line = []
+ all_field_default = OrderedDict([
+ ('name', None),
+ ('truncated', -1),
+ ('occluded', -1),
+ ('alpha', -10),
+ ('bbox', None),
+ ('dimensions', [-1, -1, -1]),
+ ('location', [-1000, -1000, -1000]),
+ ('rotation_y', -10),
+ ('score', 0.0),
+ ])
+ res_dict = [(key, None) for key, val in all_field_default.items()]
+ res_dict = OrderedDict(res_dict)
+ for key, val in result_dict.items():
+ if all_field_default[key] is None and val is None:
+ raise ValueError('you must specify a value for {}'.format(key))
+ res_dict[key] = val
+
+ for key, val in res_dict.items():
+ if key == 'name':
+ res_line.append(val)
+ elif key in ['truncated', 'alpha', 'rotation_y', 'score']:
+ if val is None:
+ res_line.append(str(all_field_default[key]))
+ else:
+ res_line.append(prec_float.format(val))
+ elif key == 'occluded':
+ if val is None:
+ res_line.append(str(all_field_default[key]))
+ else:
+ res_line.append('{}'.format(val))
+ elif key in ['bbox', 'dimensions', 'location']:
+ if val is None:
+ res_line += [str(v) for v in all_field_default[key]]
+ else:
+ res_line += [prec_float.format(v) for v in val]
+ else:
+ raise ValueError('unknown key. supported key:{}'.format(
+ res_dict.keys()))
+ return ' '.join(res_line)
diff --git a/adzoo/bevformer/data_converter/lyft_converter.py b/adzoo/bevformer/data_converter/lyft_converter.py
new file mode 100755
index 0000000..db4f0fb
--- /dev/null
+++ b/adzoo/bevformer/data_converter/lyft_converter.py
@@ -0,0 +1,268 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+from logging import warning
+from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
+from os import path as osp
+from pyquaternion import Quaternion
+
+from mmcv.datasets import LyftDataset
+from .nuscenes_converter import (get_2d_boxes, get_available_scenes,
+ obtain_sensor2top)
+
+lyft_categories = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+ 'motorcycle', 'bicycle', 'pedestrian', 'animal')
+
+
+def create_lyft_infos(root_path,
+ info_prefix,
+ version='v1.01-train',
+ max_sweeps=10):
+ """Create info file of lyft dataset.
+
+ Given the raw data, generate its related info file in pkl format.
+
+ Args:
+ root_path (str): Path of the data root.
+ info_prefix (str): Prefix of the info file to be generated.
+ version (str): Version of the data.
+ Default: 'v1.01-train'
+ max_sweeps (int): Max number of sweeps.
+ Default: 10
+ """
+ lyft = Lyft(
+ data_path=osp.join(root_path, version),
+ json_path=osp.join(root_path, version, version),
+ verbose=True)
+ available_vers = ['v1.01-train', 'v1.01-test']
+ assert version in available_vers
+ if version == 'v1.01-train':
+ train_scenes = mmcv.list_from_file('data/lyft/train.txt')
+ val_scenes = mmcv.list_from_file('data/lyft/val.txt')
+ elif version == 'v1.01-test':
+ train_scenes = mmcv.list_from_file('data/lyft/test.txt')
+ val_scenes = []
+ else:
+ raise ValueError('unknown')
+
+ # filter existing scenes.
+ available_scenes = get_available_scenes(lyft)
+ available_scene_names = [s['name'] for s in available_scenes]
+ train_scenes = list(
+ filter(lambda x: x in available_scene_names, train_scenes))
+ val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+ train_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in train_scenes
+ ])
+ val_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in val_scenes
+ ])
+
+ test = 'test' in version
+ if test:
+ print(f'test scene: {len(train_scenes)}')
+ else:
+ print(f'train scene: {len(train_scenes)}, \
+ val scene: {len(val_scenes)}')
+ train_lyft_infos, val_lyft_infos = _fill_trainval_infos(
+ lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+ metadata = dict(version=version)
+ if test:
+ print(f'test sample: {len(train_lyft_infos)}')
+ data = dict(infos=train_lyft_infos, metadata=metadata)
+ info_name = f'{info_prefix}_infos_test'
+ info_path = osp.join(root_path, f'{info_name}.pkl')
+ mmcv.dump(data, info_path)
+ else:
+ print(f'train sample: {len(train_lyft_infos)}, \
+ val sample: {len(val_lyft_infos)}')
+ data = dict(infos=train_lyft_infos, metadata=metadata)
+ train_info_name = f'{info_prefix}_infos_train'
+ info_path = osp.join(root_path, f'{train_info_name}.pkl')
+ mmcv.dump(data, info_path)
+ data['infos'] = val_lyft_infos
+ val_info_name = f'{info_prefix}_infos_val'
+ info_val_path = osp.join(root_path, f'{val_info_name}.pkl')
+ mmcv.dump(data, info_val_path)
+
+
+def _fill_trainval_infos(lyft,
+ train_scenes,
+ val_scenes,
+ test=False,
+ max_sweeps=10):
+ """Generate the train/val infos from the raw data.
+
+ Args:
+ lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset.
+ train_scenes (list[str]): Basic information of training scenes.
+ val_scenes (list[str]): Basic information of validation scenes.
+ test (bool): Whether use the test mode. In the test mode, no
+ annotations can be accessed. Default: False.
+ max_sweeps (int): Max number of sweeps. Default: 10.
+
+ Returns:
+ tuple[list[dict]]: Information of training set and
+ validation set that will be saved to the info file.
+ """
+ train_lyft_infos = []
+ val_lyft_infos = []
+
+ for sample in mmcv.track_iter_progress(lyft.sample):
+ lidar_token = sample['data']['LIDAR_TOP']
+ sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
+ cs_record = lyft.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token'])
+ abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token)
+ # nuScenes devkit returns more convenient relative paths while
+ # lyft devkit returns absolute paths
+ abs_lidar_path = str(abs_lidar_path) # absolute path
+ lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1]
+ # relative path
+
+ mmcv.check_file_exist(lidar_path)
+
+ info = {
+ 'lidar_path': lidar_path,
+ 'token': sample['token'],
+ 'sweeps': [],
+ 'cams': dict(),
+ 'lidar2ego_translation': cs_record['translation'],
+ 'lidar2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sample['timestamp'],
+ }
+
+ l2e_r = info['lidar2ego_rotation']
+ l2e_t = info['lidar2ego_translation']
+ e2g_r = info['ego2global_rotation']
+ e2g_t = info['ego2global_translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ # obtain 6 image's information per frame
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ for cam in camera_types:
+ cam_token = sample['data'][cam]
+ cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token)
+ cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat,
+ e2g_t, e2g_r_mat, cam)
+ cam_info.update(cam_intrinsic=cam_intrinsic)
+ info['cams'].update({cam: cam_info})
+
+ # obtain sweeps for a single key-frame
+ sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
+ sweeps = []
+ while len(sweeps) < max_sweeps:
+ if not sd_rec['prev'] == '':
+ sweep = obtain_sensor2top(lyft, sd_rec['prev'], l2e_t,
+ l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+ sweeps.append(sweep)
+ sd_rec = lyft.get('sample_data', sd_rec['prev'])
+ else:
+ break
+ info['sweeps'] = sweeps
+ # obtain annotation
+ if not test:
+ annotations = [
+ lyft.get('sample_annotation', token)
+ for token in sample['anns']
+ ]
+ locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+ dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+ rots = np.array([b.orientation.yaw_pitch_roll[0]
+ for b in boxes]).reshape(-1, 1)
+
+ names = [b.name for b in boxes]
+ for i in range(len(names)):
+ if names[i] in LyftDataset.NameMapping:
+ names[i] = LyftDataset.NameMapping[names[i]]
+ names = np.array(names)
+
+ # we need to convert rot to SECOND format.
+ gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
+ assert len(gt_boxes) == len(
+ annotations), f'{len(gt_boxes)}, {len(annotations)}'
+ info['gt_boxes'] = gt_boxes
+ info['gt_names'] = names
+ info['num_lidar_pts'] = np.array(
+ [a['num_lidar_pts'] for a in annotations])
+ info['num_radar_pts'] = np.array(
+ [a['num_radar_pts'] for a in annotations])
+
+ if sample['scene_token'] in train_scenes:
+ train_lyft_infos.append(info)
+ else:
+ val_lyft_infos.append(info)
+
+ return train_lyft_infos, val_lyft_infos
+
+
+def export_2d_annotation(root_path, info_path, version):
+ """Export 2d annotation from the info file and raw data.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ info_path (str): Path of the info file.
+ version (str): Dataset version.
+ """
+    warning('DeprecationWarning: 2D annotations are not used on the '
+            'Lyft dataset. The function export_2d_annotation will be '
+            'deprecated.')
+ # get bbox annotations for camera
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ lyft_infos = mmcv.load(info_path)['infos']
+ lyft = Lyft(
+ data_path=osp.join(root_path, version),
+ json_path=osp.join(root_path, version, version),
+ verbose=True)
+ # info_2d_list = []
+ cat2Ids = [
+ dict(id=lyft_categories.index(cat_name), name=cat_name)
+ for cat_name in lyft_categories
+ ]
+ coco_ann_id = 0
+ coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+ for info in mmcv.track_iter_progress(lyft_infos):
+ for cam in camera_types:
+ cam_info = info['cams'][cam]
+ coco_infos = get_2d_boxes(
+ lyft,
+ cam_info['sample_data_token'],
+ visibilities=['', '1', '2', '3', '4'])
+ (height, width, _) = mmcv.imread(cam_info['data_path']).shape
+ coco_2d_dict['images'].append(
+ dict(
+ file_name=cam_info['data_path'],
+ id=cam_info['sample_data_token'],
+ width=width,
+ height=height))
+ for coco_info in coco_infos:
+ if coco_info is None:
+ continue
+ # add an empty key for coco format
+ coco_info['segmentation'] = []
+ coco_info['id'] = coco_ann_id
+ coco_2d_dict['annotations'].append(coco_info)
+ coco_ann_id += 1
+ mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
diff --git a/adzoo/bevformer/data_converter/lyft_data_fixer.py b/adzoo/bevformer/data_converter/lyft_data_fixer.py
new file mode 100755
index 0000000..4207049
--- /dev/null
+++ b/adzoo/bevformer/data_converter/lyft_data_fixer.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import numpy as np
+import os
+
+
+def fix_lyft(root_folder='./data/lyft', version='v1.01'):
+ # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000 # noqa
+ lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin'
+ root_folder = os.path.join(root_folder, f'{version}-train')
+ lidar_path = os.path.join(root_folder, lidar_path)
+ assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \
+ f'dataset and make sure {lidar_path} is present.'
+ points = np.fromfile(lidar_path, dtype=np.float32, count=-1)
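+    # the known-corrupted file is two float32 values short, so reshaping into
+    # 5-value rows (presumably x, y, z, intensity, ring index) fails until
+    # two values are appended below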
+ try:
+ points.reshape([-1, 5])
+ print(f'This fix is not required for version {version}.')
+ except ValueError:
+ new_points = np.array(list(points) + [100.0, 1.0], dtype='float32')
+ new_points.tofile(lidar_path)
+ print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.')
+
+
+parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser')
+parser.add_argument(
+ '--root-folder',
+ type=str,
+ default='./data/lyft',
+ help='specify the root path of Lyft dataset')
+parser.add_argument(
+ '--version',
+ type=str,
+ default='v1.01',
+ help='specify Lyft dataset version')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+ fix_lyft(root_folder=args.root_folder, version=args.version)
diff --git a/adzoo/bevformer/data_converter/nuimage_converter.py b/adzoo/bevformer/data_converter/nuimage_converter.py
new file mode 100755
index 0000000..92be1de
--- /dev/null
+++ b/adzoo/bevformer/data_converter/nuimage_converter.py
@@ -0,0 +1,225 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import base64
+import mmcv
+import numpy as np
+from nuimages import NuImages
+from nuimages.utils.utils import mask_decode, name_to_index_mapping
+from os import path as osp
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+ 'barrier')
+
+NAME_MAPPING = {
+ 'movable_object.barrier': 'barrier',
+ 'vehicle.bicycle': 'bicycle',
+ 'vehicle.bus.bendy': 'bus',
+ 'vehicle.bus.rigid': 'bus',
+ 'vehicle.car': 'car',
+ 'vehicle.construction': 'construction_vehicle',
+ 'vehicle.motorcycle': 'motorcycle',
+ 'human.pedestrian.adult': 'pedestrian',
+ 'human.pedestrian.child': 'pedestrian',
+ 'human.pedestrian.construction_worker': 'pedestrian',
+ 'human.pedestrian.police_officer': 'pedestrian',
+ 'movable_object.trafficcone': 'traffic_cone',
+ 'vehicle.trailer': 'trailer',
+ 'vehicle.truck': 'truck',
+}
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Data converter arg parser')
+ parser.add_argument(
+ '--data-root',
+ type=str,
+ default='./data/nuimages',
+ help='specify the root path of dataset')
+ parser.add_argument(
+ '--version',
+ type=str,
+ nargs='+',
+ default=['v1.0-mini'],
+ required=False,
+ help='specify the dataset version')
+ parser.add_argument(
+ '--out-dir',
+ type=str,
+ default='./data/nuimages/annotations/',
+ required=False,
+ help='path to save the exported json')
+ parser.add_argument(
+ '--nproc',
+ type=int,
+ default=4,
+ required=False,
+ help='workers to process semantic masks')
+ parser.add_argument('--extra-tag', type=str, default='nuimages')
+ args = parser.parse_args()
+ return args
+
+
+def get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root):
+ """Get semantic segmentation map for an image.
+
+ Args:
+ nuim (obj:`NuImages`): NuImages dataset object
+ img_info (dict): Meta information of img
+
+ Returns:
+ np.ndarray: Semantic segmentation map of the image
+ """
+ sd_token = img_info['token']
+ image_id = img_info['id']
+ name_to_index = name_to_index_mapping(nuim.category)
+
+ # Get image data.
+ width, height = img_info['width'], img_info['height']
+ semseg_mask = np.zeros((height, width)).astype('uint8')
+
+ # Load stuff / surface regions.
+ surface_anns = [
+ o for o in nuim.surface_ann if o['sample_data_token'] == sd_token
+ ]
+
+ # Draw stuff / surface regions.
+ for ann in surface_anns:
+ # Get color and mask.
+ category_token = ann['category_token']
+ category_name = nuim.get('category', category_token)['name']
+ if ann['mask'] is None:
+ continue
+ mask = mask_decode(ann['mask'])
+
+ # Draw mask for semantic segmentation.
+ semseg_mask[mask == 1] = name_to_index[category_name]
+
+ # Load object instances.
+ object_anns = [
+ o for o in nuim.object_ann if o['sample_data_token'] == sd_token
+ ]
+
+ # Sort by token to ensure that objects always appear in the
+ # instance mask in the same order.
+ object_anns = sorted(object_anns, key=lambda k: k['token'])
+
+ # Draw object instances.
+ # The 0 index is reserved for background; thus, the instances
+ # should start from index 1.
+ annotations = []
+ for i, ann in enumerate(object_anns, start=1):
+ # Get color, box, mask and name.
+ category_token = ann['category_token']
+ category_name = nuim.get('category', category_token)['name']
+ if ann['mask'] is None:
+ continue
+ mask = mask_decode(ann['mask'])
+
+ # Draw masks for semantic segmentation and instance segmentation.
+ semseg_mask[mask == 1] = name_to_index[category_name]
+
+ if category_name in NAME_MAPPING:
+ cat_name = NAME_MAPPING[category_name]
+ cat_id = cat2id[cat_name]
+
+ x_min, y_min, x_max, y_max = ann['bbox']
+            # decode the base64-encoded RLE counts into a COCO-style mask dict
+ mask_anno = dict()
+ mask_anno['counts'] = base64.b64decode(
+ ann['mask']['counts']).decode()
+ mask_anno['size'] = ann['mask']['size']
+
+ data_anno = dict(
+ image_id=image_id,
+ category_id=cat_id,
+ bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
+ area=(x_max - x_min) * (y_max - y_min),
+ segmentation=mask_anno,
+ iscrowd=0)
+ annotations.append(data_anno)
+
+ # after process, save semantic masks
+ img_filename = img_info['file_name']
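+    # swap the extension (assumes 'jpg' only appears at the end of the path)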
+ seg_filename = img_filename.replace('jpg', 'png')
+ seg_filename = osp.join(seg_root, seg_filename)
+ mmcv.imwrite(semseg_mask, seg_filename)
+ return annotations, np.max(semseg_mask)
+
+
+def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc):
+ print('Process category information')
+ categories = []
+ categories = [
+ dict(id=nus_categories.index(cat_name), name=cat_name)
+ for cat_name in nus_categories
+ ]
+ cat2id = {k_v['name']: k_v['id'] for k_v in categories}
+
+ images = []
+ print('Process image meta information...')
+ for sample_info in mmcv.track_iter_progress(nuim.sample_data):
+ if sample_info['is_key_frame']:
+ img_idx = len(images)
+ images.append(
+ dict(
+ id=img_idx,
+ token=sample_info['token'],
+ file_name=sample_info['filename'],
+ width=sample_info['width'],
+ height=sample_info['height']))
+
+ seg_root = f'{out_dir}semantic_masks'
+ mmcv.mkdir_or_exist(seg_root)
+ mmcv.mkdir_or_exist(osp.join(data_root, 'calibrated'))
+
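+    # bind the worker to the module namespace so it can be pickled by the
+    # multiprocessing pool used by track_parallel_progress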
+ global process_img_anno
+
+ def process_img_anno(img_info):
+ single_img_annos, max_cls_id = get_img_annos(nuim, img_info, cat2id,
+ out_dir, data_root,
+ seg_root)
+ return single_img_annos, max_cls_id
+
+ print('Process img annotations...')
+ if nproc > 1:
+ outputs = mmcv.track_parallel_progress(
+ process_img_anno, images, nproc=nproc)
+ else:
+ outputs = []
+ for img_info in mmcv.track_iter_progress(images):
+ outputs.append(process_img_anno(img_info))
+
+ # Determine the index of object annotation
+ print('Process annotation information...')
+ annotations = []
+ max_cls_ids = []
+ for single_img_annos, max_cls_id in outputs:
+ max_cls_ids.append(max_cls_id)
+ for img_anno in single_img_annos:
+ img_anno.update(id=len(annotations))
+ annotations.append(img_anno)
+
+ max_cls_id = max(max_cls_ids)
+ print(f'Max ID of class in the semantic map: {max_cls_id}')
+
+ coco_format_json = dict(
+ images=images, annotations=annotations, categories=categories)
+
+ mmcv.mkdir_or_exist(out_dir)
+ out_file = osp.join(out_dir, f'{extra_tag}_{version}.json')
+ print(f'Annotation dumped to {out_file}')
+ mmcv.dump(coco_format_json, out_file)
+
+
+def main():
+ args = parse_args()
+ for version in args.version:
+ nuim = NuImages(
+ dataroot=args.data_root, version=version, verbose=True, lazy=True)
+ export_nuim_to_coco(nuim, args.data_root, args.out_dir, args.extra_tag,
+ version, args.nproc)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/data_converter/nuscenes_converter.py b/adzoo/bevformer/data_converter/nuscenes_converter.py
new file mode 100755
index 0000000..c3c071e
--- /dev/null
+++ b/adzoo/bevformer/data_converter/nuscenes_converter.py
@@ -0,0 +1,674 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import numpy as np
+import os
+from collections import OrderedDict
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.geometry_utils import view_points
+from os import path as osp
+from pyquaternion import Quaternion
+from shapely.geometry import MultiPoint, box
+from typing import List, Tuple, Union
+
+from mmcv.core.bbox.box_np_ops import points_cam2img
+from mmcv.datasets import NuScenesDataset
+from mmcv.fileio.io import dump, load
+from mmcv.image.io import imread
+from mmcv.utils import is_filepath, check_file_exist, track_iter_progress
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+ 'barrier')
+
+nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
+ 'pedestrian.moving', 'pedestrian.standing',
+ 'pedestrian.sitting_lying_down', 'vehicle.moving',
+ 'vehicle.parked', 'vehicle.stopped', 'None')
+
+
+def create_nuscenes_infos(root_path,
+ out_path,
+ can_bus_root_path,
+ info_prefix,
+ version='v1.0-trainval',
+ max_sweeps=10):
+ """Create info file of nuscene dataset.
+
+ Given the raw data, generate its related info file in pkl format.
+
+ Args:
+ root_path (str): Path of the data root.
+ info_prefix (str): Prefix of the info file to be generated.
+ version (str): Version of the data.
+ Default: 'v1.0-trainval'
+ max_sweeps (int): Max number of sweeps.
+ Default: 10
+ """
+ from nuscenes.nuscenes import NuScenes
+ from nuscenes.can_bus.can_bus_api import NuScenesCanBus
+ print(version, root_path)
+ nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+ nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path)
+ from nuscenes.utils import splits
+ available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini']
+ assert version in available_vers
+ if version == 'v1.0-trainval':
+ train_scenes = splits.train
+ val_scenes = splits.val
+ elif version == 'v1.0-test':
+ train_scenes = splits.test
+ val_scenes = []
+ elif version == 'v1.0-mini':
+ train_scenes = splits.mini_train
+ val_scenes = splits.mini_val
+ else:
+ raise ValueError('unknown')
+
+ # filter existing scenes.
+ available_scenes = get_available_scenes(nusc)
+ available_scene_names = [s['name'] for s in available_scenes]
+ train_scenes = list(
+ filter(lambda x: x in available_scene_names, train_scenes))
+ val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+ train_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in train_scenes
+ ])
+ val_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in val_scenes
+ ])
+
+ test = 'test' in version
+ if test:
+ print('test scene: {}'.format(len(train_scenes)))
+ else:
+ print('train scene: {}, val scene: {}'.format(
+ len(train_scenes), len(val_scenes)))
+
+ train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
+ nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+ metadata = dict(version=version)
+ if test:
+ print('test sample: {}'.format(len(train_nusc_infos)))
+ data = dict(infos=train_nusc_infos, metadata=metadata)
+ info_path = osp.join(out_path,
+ '{}_infos_temporal_test.pkl'.format(info_prefix))
+ dump(data, info_path)
+ else:
+ print('train sample: {}, val sample: {}'.format(
+ len(train_nusc_infos), len(val_nusc_infos)))
+ data = dict(infos=train_nusc_infos, metadata=metadata)
+ info_path = osp.join(out_path,
+ '{}_infos_temporal_train.pkl'.format(info_prefix))
+ dump(data, info_path)
+ data['infos'] = val_nusc_infos
+ info_val_path = osp.join(out_path,
+ '{}_infos_temporal_val.pkl'.format(info_prefix))
+ dump(data, info_val_path)
+
+
+def get_available_scenes(nusc):
+ """Get available scenes from the input nuscenes class.
+
+ Given the raw data, get the information of available scenes for
+ further info generation.
+
+ Args:
+ nusc (class): Dataset class in the nuScenes dataset.
+
+ Returns:
+ available_scenes (list[dict]): List of basic information for the
+ available scenes.
+ """
+ available_scenes = []
+ print('total scene num: {}'.format(len(nusc.scene)))
+ for scene in nusc.scene:
+ scene_token = scene['token']
+ scene_rec = nusc.get('scene', scene_token)
+ sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
+ sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+ has_more_frames = True
+ scene_not_exist = False
+ while has_more_frames:
+ lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token'])
+ lidar_path = str(lidar_path)
+ if os.getcwd() in lidar_path:
+ # path from lyftdataset is absolute path
+ lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1]
+ # relative path
+ if not is_filepath(lidar_path):
+ scene_not_exist = True
+ break
+ else:
+ break
+ if scene_not_exist:
+ continue
+ available_scenes.append(scene)
+ print('exist scene num: {}'.format(len(available_scenes)))
+ return available_scenes
+
+
+def _get_can_bus_info(nusc, nusc_can_bus, sample):
+ scene_name = nusc.get('scene', sample['scene_token'])['name']
+ sample_timestamp = sample['timestamp']
+ try:
+ pose_list = nusc_can_bus.get_messages(scene_name, 'pose')
+ except:
+ return np.zeros(18) # server scenes do not have can bus information.
+ can_bus = []
+    # within each scene, the first can_bus timestamp may be larger than the first sample's timestamp
+ last_pose = pose_list[0]
+ for i, pose in enumerate(pose_list):
+ if pose['utime'] > sample_timestamp:
+ break
+ last_pose = pose
+ _ = last_pose.pop('utime') # useless
+ pos = last_pose.pop('pos')
+ rotation = last_pose.pop('orientation')
+ can_bus.extend(pos)
+ can_bus.extend(rotation)
+ for key in last_pose.keys():
+        can_bus.extend(last_pose[key])  # 16 elements
+ can_bus.extend([0., 0.])
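+    # layout: 3 (pos) + 4 (orientation) + remaining pose signals (typically
+    # accel / rotation_rate / vel, 9 values) + 2 zero placeholders = 18
+    # entries; the trailing zeros may be overwritten downstream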
+ return np.array(can_bus)
+
+
+def _fill_trainval_infos(nusc,
+ nusc_can_bus,
+ train_scenes,
+ val_scenes,
+ test=False,
+ max_sweeps=10):
+ """Generate the train/val infos from the raw data.
+
+ Args:
+ nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
+ train_scenes (list[str]): Basic information of training scenes.
+ val_scenes (list[str]): Basic information of validation scenes.
+ test (bool): Whether use the test mode. In the test mode, no
+ annotations can be accessed. Default: False.
+ max_sweeps (int): Max number of sweeps. Default: 10.
+
+ Returns:
+ tuple[list[dict]]: Information of training set and validation set
+ that will be saved to the info file.
+ """
+ train_nusc_infos = []
+ val_nusc_infos = []
+ frame_idx = 0
+ for sample in track_iter_progress(nusc.sample):
+ lidar_token = sample['data']['LIDAR_TOP']
+ sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
+
+ check_file_exist(lidar_path)
+ can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample)
+ ##
+ info = {
+ 'lidar_path': lidar_path,
+ 'token': sample['token'],
+ 'prev': sample['prev'],
+ 'next': sample['next'],
+ 'can_bus': can_bus,
+ 'frame_idx': frame_idx, # temporal related info
+ 'sweeps': [],
+ 'cams': dict(),
+ 'scene_token': sample['scene_token'], # temporal related info
+ 'lidar2ego_translation': cs_record['translation'],
+ 'lidar2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sample['timestamp'],
+ }
+
+ if sample['next'] == '':
+ frame_idx = 0
+ else:
+ frame_idx += 1
+
+ l2e_r = info['lidar2ego_rotation']
+ l2e_t = info['lidar2ego_translation']
+ e2g_r = info['ego2global_rotation']
+ e2g_t = info['ego2global_translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ # obtain 6 image's information per frame
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ for cam in camera_types:
+ cam_token = sample['data'][cam]
+ cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
+ cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
+ e2g_t, e2g_r_mat, cam)
+ cam_info.update(cam_intrinsic=cam_intrinsic)
+ info['cams'].update({cam: cam_info})
+
+ # obtain sweeps for a single key-frame
+ sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ sweeps = []
+ while len(sweeps) < max_sweeps:
+ if not sd_rec['prev'] == '':
+ sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t,
+ l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+ sweeps.append(sweep)
+ sd_rec = nusc.get('sample_data', sd_rec['prev'])
+ else:
+ break
+ info['sweeps'] = sweeps
+ # obtain annotation
+ if not test:
+ annotations = [
+ nusc.get('sample_annotation', token)
+ for token in sample['anns']
+ ]
+ locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+ dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+ rots = np.array([b.orientation.yaw_pitch_roll[0]
+ for b in boxes]).reshape(-1, 1)
+ velocity = np.array(
+ [nusc.box_velocity(token)[:2] for token in sample['anns']])
+ valid_flag = np.array(
+ [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0
+ for anno in annotations],
+ dtype=bool).reshape(-1)
+ # convert velo from global to lidar
+ for i in range(len(boxes)):
+ velo = np.array([*velocity[i], 0.0])
+ velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(
+ l2e_r_mat).T
+ velocity[i] = velo[:2]
+
+ names = [b.name for b in boxes]
+ for i in range(len(names)):
+ if names[i] in NuScenesDataset.NameMapping:
+ names[i] = NuScenesDataset.NameMapping[names[i]]
+ names = np.array(names)
+ # we need to convert rot to SECOND format.
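+            # (i.e. yaw_second = -yaw_nuscenes - pi / 2)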
+ gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
+ assert len(gt_boxes) == len(
+ annotations), f'{len(gt_boxes)}, {len(annotations)}'
+ info['gt_boxes'] = gt_boxes
+ info['gt_names'] = names
+ info['gt_velocity'] = velocity.reshape(-1, 2)
+ info['num_lidar_pts'] = np.array(
+ [a['num_lidar_pts'] for a in annotations])
+ info['num_radar_pts'] = np.array(
+ [a['num_radar_pts'] for a in annotations])
+ info['valid_flag'] = valid_flag
+
+ if sample['scene_token'] in train_scenes:
+ train_nusc_infos.append(info)
+ else:
+ val_nusc_infos.append(info)
+
+ return train_nusc_infos, val_nusc_infos
+
+
+def obtain_sensor2top(nusc,
+ sensor_token,
+ l2e_t,
+ l2e_r_mat,
+ e2g_t,
+ e2g_r_mat,
+ sensor_type='lidar'):
+ """Obtain the info with RT matric from general sensor to Top LiDAR.
+
+ Args:
+ nusc (class): Dataset class in the nuScenes dataset.
+ sensor_token (str): Sample data token corresponding to the
+ specific sensor type.
+ l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+ l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+ in shape (3, 3).
+ e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+ e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+ in shape (3, 3).
+ sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+ Returns:
+ sweep (dict): Sweep information after transformation.
+ """
+ sd_rec = nusc.get('sample_data', sensor_token)
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ data_path = str(nusc.get_sample_data_path(sd_rec['token']))
+ if os.getcwd() in data_path: # path from lyftdataset is absolute path
+ data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path
+ sweep = {
+ 'data_path': data_path,
+ 'type': sensor_type,
+ 'sample_data_token': sd_rec['token'],
+ 'sensor2ego_translation': cs_record['translation'],
+ 'sensor2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sd_rec['timestamp']
+ }
+
+ l2e_r_s = sweep['sensor2ego_rotation']
+ l2e_t_s = sweep['sensor2ego_translation']
+ e2g_r_s = sweep['ego2global_rotation']
+ e2g_t_s = sweep['ego2global_translation']
+
+ # obtain the RT from sensor to Top LiDAR
+ # sweep->ego->global->ego'->lidar
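+    # chaining sensor -> sweep ego -> global -> keyframe ego -> top lidar for
+    # row-vector points gives p_lidar = p_sensor @ R + T, with R and T below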
+ l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
+ e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
+ R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
+ ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
+ sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T
+ sweep['sensor2lidar_translation'] = T
+ return sweep
+
+
+def export_2d_annotation(root_path, info_path, version, mono3d=True):
+ """Export 2d annotation from the info file and raw data.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ info_path (str): Path of the info file.
+ version (str): Dataset version.
+ mono3d (bool): Whether to export mono3d annotation. Default: True.
+ """
+ # get bbox annotations for camera
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ nusc_infos = load(info_path)['infos']
+ nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+ # info_2d_list = []
+ cat2Ids = [
+ dict(id=nus_categories.index(cat_name), name=cat_name)
+ for cat_name in nus_categories
+ ]
+ coco_ann_id = 0
+ coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+ for info in track_iter_progress(nusc_infos):
+ for cam in camera_types:
+ cam_info = info['cams'][cam]
+ coco_infos = get_2d_boxes(
+ nusc,
+ cam_info['sample_data_token'],
+ visibilities=['', '1', '2', '3', '4'],
+ mono3d=mono3d)
+ (height, width, _) = imread(cam_info['data_path']).shape
+ coco_2d_dict['images'].append(
+ dict(
+ file_name=cam_info['data_path'].split('data/nuscenes/')
+ [-1],
+ id=cam_info['sample_data_token'],
+ token=info['token'],
+ cam2ego_rotation=cam_info['sensor2ego_rotation'],
+ cam2ego_translation=cam_info['sensor2ego_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ ego2global_translation=info['ego2global_translation'],
+ cam_intrinsic=cam_info['cam_intrinsic'],
+ width=width,
+ height=height))
+ for coco_info in coco_infos:
+ if coco_info is None:
+ continue
+ # add an empty key for coco format
+ coco_info['segmentation'] = []
+ coco_info['id'] = coco_ann_id
+ coco_2d_dict['annotations'].append(coco_info)
+ coco_ann_id += 1
+ if mono3d:
+ json_prefix = f'{info_path[:-4]}_mono3d'
+ else:
+ json_prefix = f'{info_path[:-4]}'
+ dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(nusc,
+ sample_data_token: str,
+ visibilities: List[str],
+ mono3d=True):
+ """Get the 2D annotation records for a given `sample_data_token`.
+
+ Args:
+ sample_data_token (str): Sample data token belonging to a camera \
+ keyframe.
+ visibilities (list[str]): Visibility filter.
+ mono3d (bool): Whether to get boxes with mono3d annotation.
+
+ Return:
+ list[dict]: List of 2D annotation record that belongs to the input
+ `sample_data_token`.
+ """
+
+ # Get the sample data and the sample corresponding to that sample data.
+ sd_rec = nusc.get('sample_data', sample_data_token)
+
+ assert sd_rec[
+ 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
+ ' for camera sample_data!'
+ if not sd_rec['is_key_frame']:
+ raise ValueError(
+ 'The 2D re-projections are available only for keyframes.')
+
+ s_rec = nusc.get('sample', sd_rec['sample_token'])
+
+ # Get the calibrated sensor and ego pose
+ # record to get the transformation matrices.
+ cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
+ pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
+
+ # Get all the annotation with the specified visibilties.
+ ann_recs = [
+ nusc.get('sample_annotation', token) for token in s_rec['anns']
+ ]
+ ann_recs = [
+ ann_rec for ann_rec in ann_recs
+ if (ann_rec['visibility_token'] in visibilities)
+ ]
+
+ repro_recs = []
+
+ for ann_rec in ann_recs:
+ # Augment sample_annotation with token information.
+ ann_rec['sample_annotation_token'] = ann_rec['token']
+ ann_rec['sample_data_token'] = sample_data_token
+
+ # Get the box in global coordinates.
+ box = nusc.get_box(ann_rec['token'])
+
+ # Move them to the ego-pose frame.
+ box.translate(-np.array(pose_rec['translation']))
+ box.rotate(Quaternion(pose_rec['rotation']).inverse)
+
+ # Move them to the calibrated sensor frame.
+ box.translate(-np.array(cs_rec['translation']))
+ box.rotate(Quaternion(cs_rec['rotation']).inverse)
+
+ # Filter out the corners that are not in front of the calibrated
+ # sensor.
+ corners_3d = box.corners()
+ in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+ corners_3d = corners_3d[:, in_front]
+
+ # Project 3d box to 2d.
+ corner_coords = view_points(corners_3d, camera_intrinsic,
+ True).T[:, :2].tolist()
+
+ # Keep only corners that fall within the image.
+ final_coords = post_process_coords(corner_coords)
+
+ # Skip if the convex hull of the re-projected corners
+ # does not intersect the image canvas.
+ if final_coords is None:
+ continue
+ else:
+ min_x, min_y, max_x, max_y = final_coords
+
+ # Generate dictionary record to be included in the .json file.
+ repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+ sample_data_token, sd_rec['filename'])
+
+ # If mono3d=True, add 3D annotations in camera coordinates
+ if mono3d and (repro_rec is not None):
+ loc = box.center.tolist()
+
+ dim = box.wlh
+ dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
+ dim = dim.tolist()
+
+ rot = box.orientation.yaw_pitch_roll[0]
+ rot = [-rot] # convert the rot to our cam coordinate
+
+ global_velo2d = nusc.box_velocity(box.token)[:2]
+ global_velo3d = np.array([*global_velo2d, 0.0])
+ e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
+ c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
+ cam_velo3d = global_velo3d @ np.linalg.inv(
+ e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
+ velo = cam_velo3d[0::2].tolist()
+
+ repro_rec['bbox_cam3d'] = loc + dim + rot
+ repro_rec['velo_cam3d'] = velo
+
+ center3d = np.array(loc).reshape([1, 3])
+ center2d = points_cam2img(
+ center3d, camera_intrinsic, with_depth=True)
+ repro_rec['center2d'] = center2d.squeeze().tolist()
+ # normalized center2D + depth
+            # samples with depth <= 0 will be removed
+ if repro_rec['center2d'][2] <= 0:
+ continue
+
+ ann_token = nusc.get('sample_annotation',
+ box.token)['attribute_tokens']
+ if len(ann_token) == 0:
+ attr_name = 'None'
+ else:
+ attr_name = nusc.get('attribute', ann_token[0])['name']
+ attr_id = nus_attributes.index(attr_name)
+ repro_rec['attribute_name'] = attr_name
+ repro_rec['attribute_id'] = attr_id
+
+ repro_recs.append(repro_rec)
+
+ return repro_recs
+
+
+def post_process_coords(
+ corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
+) -> Union[Tuple[float, float, float, float], None]:
+ """Get the intersection of the convex hull of the reprojected bbox corners
+ and the image canvas, return None if no intersection.
+
+ Args:
+ corner_coords (list[int]): Corner coordinates of reprojected
+ bounding box.
+ imsize (tuple[int]): Size of the image canvas.
+
+ Return:
+ tuple [float]: Intersection of the convex hull of the 2D box
+ corners and the image canvas.
+ """
+ polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
+ img_canvas = box(0, 0, imsize[0], imsize[1])
+
+ if polygon_from_2d_box.intersects(img_canvas):
+ img_intersection = polygon_from_2d_box.intersection(img_canvas)
+ intersection_coords = np.array(
+ [coord for coord in img_intersection.exterior.coords])
+
+ min_x = min(intersection_coords[:, 0])
+ min_y = min(intersection_coords[:, 1])
+ max_x = max(intersection_coords[:, 0])
+ max_y = max(intersection_coords[:, 1])
+
+ return min_x, min_y, max_x, max_y
+ else:
+ return None
+
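+# A quick sanity check of post_process_coords (corner values are made up for
+# illustration): the convex hull of [(-50, 100), (200, 100), (200, 400),
+# (-50, 400)] spans x in [-50, 200] and y in [100, 400]; intersecting it with
+# box(0, 0, 1600, 900) clips the negative x range, so the function returns
+# (min_x, min_y, max_x, max_y) = (0.0, 100.0, 200.0, 400.0).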
+
+def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
+ sample_data_token: str, filename: str) -> OrderedDict:
+    """Generate one 2D annotation record given various information on top of
+    the 2D bounding box coordinates.
+
+ Args:
+ ann_rec (dict): Original 3d annotation record.
+ x1 (float): Minimum value of the x coordinate.
+ y1 (float): Minimum value of the y coordinate.
+ x2 (float): Maximum value of the x coordinate.
+ y2 (float): Maximum value of the y coordinate.
+ sample_data_token (str): Sample data token.
+        filename (str): The corresponding image file where the annotation
+            is present.
+
+ Returns:
+ dict: A sample 2D annotation record.
+            - file_name (str): file name
+ - image_id (str): sample data token
+ - area (float): 2d box area
+ - category_name (str): category name
+ - category_id (int): category id
+ - bbox (list[float]): left x, top y, dx, dy of 2d box
+ - iscrowd (int): whether the area is crowd
+ """
+ repro_rec = OrderedDict()
+ repro_rec['sample_data_token'] = sample_data_token
+ coco_rec = dict()
+
+ relevant_keys = [
+ 'attribute_tokens',
+ 'category_name',
+ 'instance_token',
+ 'next',
+ 'num_lidar_pts',
+ 'num_radar_pts',
+ 'prev',
+ 'sample_annotation_token',
+ 'sample_data_token',
+ 'visibility_token',
+ ]
+
+ for key, value in ann_rec.items():
+ if key in relevant_keys:
+ repro_rec[key] = value
+
+ repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+ repro_rec['filename'] = filename
+
+ coco_rec['file_name'] = filename
+ coco_rec['image_id'] = sample_data_token
+ coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+ if repro_rec['category_name'] not in NuScenesDataset.NameMapping:
+ return None
+ cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']]
+ coco_rec['category_name'] = cat_name
+ coco_rec['category_id'] = nus_categories.index(cat_name)
+ coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+ coco_rec['iscrowd'] = 0
+
+ return coco_rec
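+# The record returned above follows the COCO convention; an illustrative
+# (made-up) example looks like:
+#
+#     {'file_name': 'samples/CAM_FRONT/xxx.jpg',
+#      'image_id': sample_data_token,
+#      'area': 3000.0,
+#      'category_name': 'car',
+#      'category_id': nus_categories.index('car'),
+#      'bbox': [x1, y1, x2 - x1, y2 - y1],  # left x, top y, width, height
+#      'iscrowd': 0}
+#
+# and None is returned when the category is not in NuScenesDataset.NameMapping.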
diff --git a/adzoo/bevformer/data_converter/s3dis_data_utils.py b/adzoo/bevformer/data_converter/s3dis_data_utils.py
new file mode 100755
index 0000000..d2b6b77
--- /dev/null
+++ b/adzoo/bevformer/data_converter/s3dis_data_utils.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+from concurrent import futures as futures
+from os import path as osp
+
+
+class S3DISData(object):
+ """S3DIS data.
+
+ Generate s3dis infos for s3dis_converter.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ split (str): Set split type of the data. Default: 'Area_1'.
+ """
+
+ def __init__(self, root_path, split='Area_1'):
+ self.root_dir = root_path
+ self.split = split
+ self.data_dir = osp.join(root_path,
+ 'Stanford3dDataset_v1.2_Aligned_Version')
+
+        # Following GSDN, use 5 furniture
+ # classes for detection: table, chair, sofa, bookcase, board.
+ self.cat_ids = np.array([7, 8, 9, 10, 11])
+ self.cat_ids2class = {
+ cat_id: i
+ for i, cat_id in enumerate(list(self.cat_ids))
+ }
+
+ assert split in [
+ 'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6'
+ ]
+        # keep only room directories (e.g. conferenceRoom_1); removing items
+        # from a list while iterating over it skips entries, so filter with a
+        # list comprehension instead
+        self.sample_id_list = [
+            sample_id
+            for sample_id in os.listdir(osp.join(self.data_dir, split))
+            if not os.path.isfile(osp.join(self.data_dir, split, sample_id))
+        ]
+
+ def __len__(self):
+ return len(self.sample_id_list)
+
+ def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+ """Get data infos.
+
+ This method gets information from the raw data.
+
+ Args:
+ num_workers (int): Number of threads to be used. Default: 4.
+ has_label (bool): Whether the data has label. Default: True.
+ sample_id_list (list[int]): Index list of the sample.
+ Default: None.
+
+ Returns:
+ infos (list[dict]): Information of the raw data.
+ """
+
+ def process_single_scene(sample_idx):
+ print(f'{self.split} sample_idx: {sample_idx}')
+ info = dict()
+ pc_info = {
+ 'num_features': 6,
+ 'lidar_idx': f'{self.split}_{sample_idx}'
+ }
+ info['point_cloud'] = pc_info
+ pts_filename = osp.join(self.root_dir, 's3dis_data',
+ f'{self.split}_{sample_idx}_point.npy')
+ pts_instance_mask_path = osp.join(
+ self.root_dir, 's3dis_data',
+ f'{self.split}_{sample_idx}_ins_label.npy')
+ pts_semantic_mask_path = osp.join(
+ self.root_dir, 's3dis_data',
+ f'{self.split}_{sample_idx}_sem_label.npy')
+
+ points = np.load(pts_filename).astype(np.float32)
+            pts_instance_mask = np.load(pts_instance_mask_path).astype(np.int64)
+            pts_semantic_mask = np.load(pts_semantic_mask_path).astype(np.int64)
+
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
+
+ points.tofile(
+ osp.join(self.root_dir, 'points',
+ f'{self.split}_{sample_idx}.bin'))
+ pts_instance_mask.tofile(
+ osp.join(self.root_dir, 'instance_mask',
+ f'{self.split}_{sample_idx}.bin'))
+ pts_semantic_mask.tofile(
+ osp.join(self.root_dir, 'semantic_mask',
+ f'{self.split}_{sample_idx}.bin'))
+
+ info['pts_path'] = osp.join('points',
+ f'{self.split}_{sample_idx}.bin')
+ info['pts_instance_mask_path'] = osp.join(
+ 'instance_mask', f'{self.split}_{sample_idx}.bin')
+ info['pts_semantic_mask_path'] = osp.join(
+ 'semantic_mask', f'{self.split}_{sample_idx}.bin')
+ info['annos'] = self.get_bboxes(points, pts_instance_mask,
+ pts_semantic_mask)
+
+ return info
+
+ sample_id_list = sample_id_list if sample_id_list is not None \
+ else self.sample_id_list
+ with futures.ThreadPoolExecutor(num_workers) as executor:
+ infos = executor.map(process_single_scene, sample_id_list)
+ return list(infos)
+
+ def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask):
+ """Convert instance masks to axis-aligned bounding boxes.
+
+ Args:
+ points (np.array): Scene points of shape (n, 6).
+ pts_instance_mask (np.ndarray): Instance labels of shape (n,).
+ pts_semantic_mask (np.ndarray): Semantic labels of shape (n,).
+
+ Returns:
+ dict: A dict containing detection infos with following keys:
+
+ - gt_boxes_upright_depth (np.ndarray): Bounding boxes
+ of shape (n, 6)
+ - class (np.ndarray): Box labels of shape (n,)
+ - gt_num (int): Number of boxes.
+ """
+ bboxes, labels = [], []
+        # instance ids run from 1 to pts_instance_mask.max(), inclusive
+        for i in range(1, pts_instance_mask.max() + 1):
+ ids = pts_instance_mask == i
+ # check if all instance points have same semantic label
+ assert pts_semantic_mask[ids].min() == pts_semantic_mask[ids].max()
+ label = pts_semantic_mask[ids][0]
+ # keep only furniture objects
+ if label in self.cat_ids2class:
+ labels.append(self.cat_ids2class[pts_semantic_mask[ids][0]])
+ pts = points[:, :3][ids]
+ min_pts = pts.min(axis=0)
+ max_pts = pts.max(axis=0)
+ locations = (min_pts + max_pts) / 2
+ dimensions = max_pts - min_pts
+ bboxes.append(np.concatenate((locations, dimensions)))
+ annotation = dict()
+ # follow ScanNet and SUN RGB-D keys
+ annotation['gt_boxes_upright_depth'] = np.array(bboxes)
+ annotation['class'] = np.array(labels)
+ annotation['gt_num'] = len(labels)
+ return annotation
+
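+# For reference, the axis-aligned box construction above boils down to the
+# following (a rough numpy sketch, not executed here):
+#
+#     pts = points[pts_instance_mask == i, :3]
+#     center = (pts.min(0) + pts.max(0)) / 2
+#     size = pts.max(0) - pts.min(0)
+#     bbox = np.concatenate([center, size])   # (x, y, z, dx, dy, dz)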
+
+class S3DISSegData(object):
+ """S3DIS dataset used to generate infos for semantic segmentation task.
+
+ Args:
+ data_root (str): Root path of the raw data.
+        ann_file (str): The generated s3dis infos.
+        split (str): Set split type of the data. Default: 'Area_1'.
+        num_points (int): Number of points in each data input. Default: 4096.
+ label_weight_func (function): Function to compute the label weight.
+ Default: None.
+ """
+
+ def __init__(self,
+ data_root,
+ ann_file,
+ split='Area_1',
+ num_points=4096,
+ label_weight_func=None):
+ self.data_root = data_root
+ self.data_infos = mmcv.load(ann_file)
+ self.split = split
+ self.num_points = num_points
+
+ self.all_ids = np.arange(13) # all possible ids
+ self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12]) # used for seg task
+ self.ignore_index = len(self.cat_ids)
+
+        self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int64) * \
+            self.ignore_index
+ for i, cat_id in enumerate(self.cat_ids):
+ self.cat_id2class[cat_id] = i
+
+ # label weighting function is taken from
+ # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+ self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
+ label_weight_func is None else label_weight_func
+
+ def get_seg_infos(self):
+ scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
+ save_folder = osp.join(self.data_root, 'seg_info')
+ mmcv.mkdir_or_exist(save_folder)
+ np.save(
+ osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
+ scene_idxs)
+ np.save(
+ osp.join(save_folder, f'{self.split}_label_weight.npy'),
+ label_weight)
+ print(f'{self.split} resampled scene index and label weight saved')
+
+ def _convert_to_label(self, mask):
+ """Convert class_id in loaded segmentation mask to label."""
+ if isinstance(mask, str):
+ if mask.endswith('npy'):
+ mask = np.load(mask)
+ else:
+                mask = np.fromfile(mask, dtype=np.int64)
+ label = self.cat_id2class[mask]
+ return label
+
+ def get_scene_idxs_and_label_weight(self):
+ """Compute scene_idxs for data sampling and label weight for loss \
+ calculation.
+
+ We sample more times for scenes with more points. Label_weight is
+ inversely proportional to number of class points.
+ """
+ num_classes = len(self.cat_ids)
+ num_point_all = []
+ label_weight = np.zeros((num_classes + 1, )) # ignore_index
+ for data_info in self.data_infos:
+ label = self._convert_to_label(
+ osp.join(self.data_root, data_info['pts_semantic_mask_path']))
+ num_point_all.append(label.shape[0])
+ class_count, _ = np.histogram(label, range(num_classes + 2))
+ label_weight += class_count
+
+ # repeat scene_idx for num_scene_point // num_sample_point times
+ sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
+ num_iter = int(np.sum(num_point_all) / float(self.num_points))
+ scene_idxs = []
+ for idx in range(len(self.data_infos)):
+ scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
+ scene_idxs = np.array(scene_idxs).astype(np.int32)
+
+ # calculate label weight, adopted from PointNet++
+ label_weight = label_weight[:-1].astype(np.float32)
+ label_weight = label_weight / label_weight.sum()
+ label_weight = self.label_weight_func(label_weight).astype(np.float32)
+
+ return scene_idxs, label_weight
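+# The default weighting above follows the PointNet++ recipe: with f_c the
+# fraction of points belonging to class c, the weight is 1 / log(1.2 + f_c),
+# so rarer classes receive larger weights. A minimal standalone sketch with
+# made-up counts:
+#
+#     import numpy as np
+#     counts = np.array([5.0e6, 2.0e5, 1.0e4])  # points per class
+#     freq = counts / counts.sum()
+#     weights = 1.0 / np.log(1.2 + freq)        # rare classes get larger weights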
diff --git a/adzoo/bevformer/data_converter/scannet_data_utils.py b/adzoo/bevformer/data_converter/scannet_data_utils.py
new file mode 100755
index 0000000..a437fe0
--- /dev/null
+++ b/adzoo/bevformer/data_converter/scannet_data_utils.py
@@ -0,0 +1,293 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+from concurrent import futures as futures
+from os import path as osp
+
+
+class ScanNetData(object):
+ """ScanNet data.
+
+ Generate scannet infos for scannet_converter.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ split (str): Set split type of the data. Default: 'train'.
+ """
+
+ def __init__(self, root_path, split='train'):
+ self.root_dir = root_path
+ self.split = split
+ self.split_dir = osp.join(root_path)
+ self.classes = [
+ 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+ 'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+ 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+ 'garbagebin'
+ ]
+ self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
+ self.label2cat = {self.cat2label[t]: t for t in self.cat2label}
+ self.cat_ids = np.array(
+ [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39])
+ self.cat_ids2class = {
+ nyu40id: i
+ for i, nyu40id in enumerate(list(self.cat_ids))
+ }
+ assert split in ['train', 'val', 'test']
+ split_file = osp.join(self.root_dir, 'meta_data',
+ f'scannetv2_{split}.txt')
+ mmcv.check_file_exist(split_file)
+ self.sample_id_list = mmcv.list_from_file(split_file)
+ self.test_mode = (split == 'test')
+
+ def __len__(self):
+ return len(self.sample_id_list)
+
+ def get_aligned_box_label(self, idx):
+ box_file = osp.join(self.root_dir, 'scannet_instance_data',
+ f'{idx}_aligned_bbox.npy')
+ mmcv.check_file_exist(box_file)
+ return np.load(box_file)
+
+ def get_unaligned_box_label(self, idx):
+ box_file = osp.join(self.root_dir, 'scannet_instance_data',
+ f'{idx}_unaligned_bbox.npy')
+ mmcv.check_file_exist(box_file)
+ return np.load(box_file)
+
+ def get_axis_align_matrix(self, idx):
+ matrix_file = osp.join(self.root_dir, 'scannet_instance_data',
+ f'{idx}_axis_align_matrix.npy')
+ mmcv.check_file_exist(matrix_file)
+ return np.load(matrix_file)
+
+ def get_images(self, idx):
+ paths = []
+ path = osp.join(self.root_dir, 'posed_images', idx)
+ for file in sorted(os.listdir(path)):
+ if file.endswith('.jpg'):
+ paths.append(osp.join('posed_images', idx, file))
+ return paths
+
+ def get_extrinsics(self, idx):
+ extrinsics = []
+ path = osp.join(self.root_dir, 'posed_images', idx)
+ for file in sorted(os.listdir(path)):
+ if file.endswith('.txt') and not file == 'intrinsic.txt':
+ extrinsics.append(np.loadtxt(osp.join(path, file)))
+ return extrinsics
+
+ def get_intrinsics(self, idx):
+ matrix_file = osp.join(self.root_dir, 'posed_images', idx,
+ 'intrinsic.txt')
+ mmcv.check_file_exist(matrix_file)
+ return np.loadtxt(matrix_file)
+
+ def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+ """Get data infos.
+
+ This method gets information from the raw data.
+
+ Args:
+ num_workers (int): Number of threads to be used. Default: 4.
+ has_label (bool): Whether the data has label. Default: True.
+ sample_id_list (list[int]): Index list of the sample.
+ Default: None.
+
+ Returns:
+ infos (list[dict]): Information of the raw data.
+ """
+
+ def process_single_scene(sample_idx):
+ print(f'{self.split} sample_idx: {sample_idx}')
+ info = dict()
+ pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
+ info['point_cloud'] = pc_info
+ pts_filename = osp.join(self.root_dir, 'scannet_instance_data',
+ f'{sample_idx}_vert.npy')
+ points = np.load(pts_filename)
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+ points.tofile(
+ osp.join(self.root_dir, 'points', f'{sample_idx}.bin'))
+ info['pts_path'] = osp.join('points', f'{sample_idx}.bin')
+
+ # update with RGB image paths if exist
+ if os.path.exists(osp.join(self.root_dir, 'posed_images')):
+ info['intrinsics'] = self.get_intrinsics(sample_idx)
+ all_extrinsics = self.get_extrinsics(sample_idx)
+ all_img_paths = self.get_images(sample_idx)
+ # some poses in ScanNet are invalid
+ extrinsics, img_paths = [], []
+ for extrinsic, img_path in zip(all_extrinsics, all_img_paths):
+ if np.all(np.isfinite(extrinsic)):
+ img_paths.append(img_path)
+ extrinsics.append(extrinsic)
+ info['extrinsics'] = extrinsics
+ info['img_paths'] = img_paths
+
+ if not self.test_mode:
+ pts_instance_mask_path = osp.join(
+ self.root_dir, 'scannet_instance_data',
+ f'{sample_idx}_ins_label.npy')
+ pts_semantic_mask_path = osp.join(
+ self.root_dir, 'scannet_instance_data',
+ f'{sample_idx}_sem_label.npy')
+
+                pts_instance_mask = np.load(pts_instance_mask_path).astype(
+                    np.int64)
+                pts_semantic_mask = np.load(pts_semantic_mask_path).astype(
+                    np.int64)
+
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
+
+ pts_instance_mask.tofile(
+ osp.join(self.root_dir, 'instance_mask',
+ f'{sample_idx}.bin'))
+ pts_semantic_mask.tofile(
+ osp.join(self.root_dir, 'semantic_mask',
+ f'{sample_idx}.bin'))
+
+ info['pts_instance_mask_path'] = osp.join(
+ 'instance_mask', f'{sample_idx}.bin')
+ info['pts_semantic_mask_path'] = osp.join(
+ 'semantic_mask', f'{sample_idx}.bin')
+
+ if has_label:
+ annotations = {}
+ # box is of shape [k, 6 + class]
+ aligned_box_label = self.get_aligned_box_label(sample_idx)
+ unaligned_box_label = self.get_unaligned_box_label(sample_idx)
+ annotations['gt_num'] = aligned_box_label.shape[0]
+ if annotations['gt_num'] != 0:
+ aligned_box = aligned_box_label[:, :-1] # k, 6
+ unaligned_box = unaligned_box_label[:, :-1]
+ classes = aligned_box_label[:, -1] # k
+ annotations['name'] = np.array([
+ self.label2cat[self.cat_ids2class[classes[i]]]
+ for i in range(annotations['gt_num'])
+ ])
+ # default names are given to aligned bbox for compatibility
+ # we also save unaligned bbox info with marked names
+ annotations['location'] = aligned_box[:, :3]
+ annotations['dimensions'] = aligned_box[:, 3:6]
+ annotations['gt_boxes_upright_depth'] = aligned_box
+ annotations['unaligned_location'] = unaligned_box[:, :3]
+ annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
+ annotations[
+ 'unaligned_gt_boxes_upright_depth'] = unaligned_box
+ annotations['index'] = np.arange(
+ annotations['gt_num'], dtype=np.int32)
+ annotations['class'] = np.array([
+ self.cat_ids2class[classes[i]]
+ for i in range(annotations['gt_num'])
+ ])
+ axis_align_matrix = self.get_axis_align_matrix(sample_idx)
+ annotations['axis_align_matrix'] = axis_align_matrix # 4x4
+ info['annos'] = annotations
+ return info
+
+ sample_id_list = sample_id_list if sample_id_list is not None \
+ else self.sample_id_list
+ with futures.ThreadPoolExecutor(num_workers) as executor:
+ infos = executor.map(process_single_scene, sample_id_list)
+ return list(infos)
+
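+# Hypothetical usage of ScanNetData (paths are placeholders and depend on how
+# the raw ScanNet data was preprocessed):
+#
+#     scannet = ScanNetData(root_path='./data/scannet', split='train')
+#     infos = scannet.get_infos(num_workers=4, has_label=True)
+#     mmcv.dump(infos, './data/scannet/scannet_infos_train.pkl')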
+
+class ScanNetSegData(object):
+ """ScanNet dataset used to generate infos for semantic segmentation task.
+
+ Args:
+ data_root (str): Root path of the raw data.
+ ann_file (str): The generated scannet infos.
+ split (str): Set split type of the data. Default: 'train'.
+ num_points (int): Number of points in each data input. Default: 8192.
+ label_weight_func (function): Function to compute the label weight.
+ Default: None.
+ """
+
+ def __init__(self,
+ data_root,
+ ann_file,
+ split='train',
+ num_points=8192,
+ label_weight_func=None):
+ self.data_root = data_root
+ self.data_infos = mmcv.load(ann_file)
+ self.split = split
+ assert split in ['train', 'val', 'test']
+ self.num_points = num_points
+
+ self.all_ids = np.arange(41) # all possible ids
+ self.cat_ids = np.array([
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36,
+ 39
+ ]) # used for seg task
+ self.ignore_index = len(self.cat_ids)
+
+        self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int64) * \
+            self.ignore_index
+ for i, cat_id in enumerate(self.cat_ids):
+ self.cat_id2class[cat_id] = i
+
+ # label weighting function is taken from
+ # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+ self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
+ label_weight_func is None else label_weight_func
+
+ def get_seg_infos(self):
+ if self.split == 'test':
+ return
+ scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
+ save_folder = osp.join(self.data_root, 'seg_info')
+ mmcv.mkdir_or_exist(save_folder)
+ np.save(
+ osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
+ scene_idxs)
+ np.save(
+ osp.join(save_folder, f'{self.split}_label_weight.npy'),
+ label_weight)
+ print(f'{self.split} resampled scene index and label weight saved')
+
+ def _convert_to_label(self, mask):
+ """Convert class_id in loaded segmentation mask to label."""
+ if isinstance(mask, str):
+ if mask.endswith('npy'):
+ mask = np.load(mask)
+ else:
+                mask = np.fromfile(mask, dtype=np.int64)
+ label = self.cat_id2class[mask]
+ return label
+
+ def get_scene_idxs_and_label_weight(self):
+ """Compute scene_idxs for data sampling and label weight for loss \
+ calculation.
+
+ We sample more times for scenes with more points. Label_weight is
+ inversely proportional to number of class points.
+ """
+ num_classes = len(self.cat_ids)
+ num_point_all = []
+ label_weight = np.zeros((num_classes + 1, )) # ignore_index
+ for data_info in self.data_infos:
+ label = self._convert_to_label(
+ osp.join(self.data_root, data_info['pts_semantic_mask_path']))
+ num_point_all.append(label.shape[0])
+ class_count, _ = np.histogram(label, range(num_classes + 2))
+ label_weight += class_count
+
+ # repeat scene_idx for num_scene_point // num_sample_point times
+ sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
+ num_iter = int(np.sum(num_point_all) / float(self.num_points))
+ scene_idxs = []
+ for idx in range(len(self.data_infos)):
+ scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
+ scene_idxs = np.array(scene_idxs).astype(np.int32)
+
+ # calculate label weight, adopted from PointNet++
+ label_weight = label_weight[:-1].astype(np.float32)
+ label_weight = label_weight / label_weight.sum()
+ label_weight = self.label_weight_func(label_weight).astype(np.float32)
+
+ return scene_idxs, label_weight
diff --git a/adzoo/bevformer/data_converter/sunrgbd_data_utils.py b/adzoo/bevformer/data_converter/sunrgbd_data_utils.py
new file mode 100755
index 0000000..9f8a502
--- /dev/null
+++ b/adzoo/bevformer/data_converter/sunrgbd_data_utils.py
@@ -0,0 +1,221 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from concurrent import futures as futures
+from os import path as osp
+from scipy import io as sio
+
+
+def random_sampling(points, num_points, replace=None, return_choices=False):
+ """Random sampling.
+
+ Sampling point cloud to a certain number of points.
+
+ Args:
+ points (ndarray): Point cloud.
+ num_points (int): The number of samples.
+ replace (bool): Whether the sample is with or without replacement.
+ return_choices (bool): Whether to return choices.
+
+ Returns:
+ points (ndarray): Point cloud after sampling.
+ """
+
+ if replace is None:
+ replace = (points.shape[0] < num_points)
+ choices = np.random.choice(points.shape[0], num_points, replace=replace)
+ if return_choices:
+ return points[choices], choices
+ else:
+ return points[choices]
+
+
+class SUNRGBDInstance(object):
+
+ def __init__(self, line):
+ data = line.split(' ')
+ data[1:] = [float(x) for x in data[1:]]
+ self.classname = data[0]
+ self.xmin = data[1]
+ self.ymin = data[2]
+ self.xmax = data[1] + data[3]
+ self.ymax = data[2] + data[4]
+ self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax])
+ self.centroid = np.array([data[5], data[6], data[7]])
+ self.w = data[8]
+ self.l = data[9] # noqa: E741
+ self.h = data[10]
+ self.orientation = np.zeros((3, ))
+ self.orientation[0] = data[11]
+ self.orientation[1] = data[12]
+ self.heading_angle = -1 * np.arctan2(self.orientation[1],
+ self.orientation[0])
+ self.box3d = np.concatenate([
+ self.centroid,
+ np.array([self.l * 2, self.w * 2, self.h * 2, self.heading_angle])
+ ])
+
+
+class SUNRGBDData(object):
+ """SUNRGBD data.
+
+    Generate sunrgbd infos for sunrgbd_converter.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ split (str): Set split type of the data. Default: 'train'.
+ use_v1 (bool): Whether to use v1. Default: False.
+ """
+
+ def __init__(self, root_path, split='train', use_v1=False):
+ self.root_dir = root_path
+ self.split = split
+ self.split_dir = osp.join(root_path, 'sunrgbd_trainval')
+ self.classes = [
+ 'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+ 'night_stand', 'bookshelf', 'bathtub'
+ ]
+ self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
+ self.label2cat = {
+ label: self.classes[label]
+ for label in range(len(self.classes))
+ }
+ assert split in ['train', 'val', 'test']
+ split_file = osp.join(self.split_dir, f'{split}_data_idx.txt')
+ mmcv.check_file_exist(split_file)
+        # materialize the map iterator so len() and repeated iteration work
+        self.sample_id_list = list(map(int, mmcv.list_from_file(split_file)))
+ self.image_dir = osp.join(self.split_dir, 'image')
+ self.calib_dir = osp.join(self.split_dir, 'calib')
+ self.depth_dir = osp.join(self.split_dir, 'depth')
+ if use_v1:
+ self.label_dir = osp.join(self.split_dir, 'label_v1')
+ else:
+ self.label_dir = osp.join(self.split_dir, 'label')
+
+ def __len__(self):
+ return len(self.sample_id_list)
+
+ def get_image(self, idx):
+ img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg')
+ return mmcv.imread(img_filename)
+
+ def get_image_shape(self, idx):
+ image = self.get_image(idx)
+ return np.array(image.shape[:2], dtype=np.int32)
+
+ def get_depth(self, idx):
+ depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat')
+ depth = sio.loadmat(depth_filename)['instance']
+ return depth
+
+ def get_calibration(self, idx):
+ calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt')
+ lines = [line.rstrip() for line in open(calib_filepath)]
+ Rt = np.array([float(x) for x in lines[0].split(' ')])
+ Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32)
+ K = np.array([float(x) for x in lines[1].split(' ')])
+ K = np.reshape(K, (3, 3), order='F').astype(np.float32)
+ return K, Rt
+
+ def get_label_objects(self, idx):
+ label_filename = osp.join(self.label_dir, f'{idx:06d}.txt')
+ lines = [line.rstrip() for line in open(label_filename)]
+ objects = [SUNRGBDInstance(line) for line in lines]
+ return objects
+
+ def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+ """Get data infos.
+
+ This method gets information from the raw data.
+
+ Args:
+ num_workers (int): Number of threads to be used. Default: 4.
+ has_label (bool): Whether the data has label. Default: True.
+ sample_id_list (list[int]): Index list of the sample.
+ Default: None.
+
+ Returns:
+ infos (list[dict]): Information of the raw data.
+ """
+
+ def process_single_scene(sample_idx):
+ print(f'{self.split} sample_idx: {sample_idx}')
+ # convert depth to points
+ SAMPLE_NUM = 50000
+            # TODO: check whether the point sampling step can be moved into
+            # the training pipeline instead of running at conversion time.
+ pc_upright_depth = self.get_depth(sample_idx)
+ pc_upright_depth_subsampled = random_sampling(
+ pc_upright_depth, SAMPLE_NUM)
+
+ info = dict()
+ pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
+ info['point_cloud'] = pc_info
+
+ mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+ pc_upright_depth_subsampled.tofile(
+ osp.join(self.root_dir, 'points', f'{sample_idx:06d}.bin'))
+
+ info['pts_path'] = osp.join('points', f'{sample_idx:06d}.bin')
+ img_path = osp.join('image', f'{sample_idx:06d}.jpg')
+ image_info = {
+ 'image_idx': sample_idx,
+ 'image_shape': self.get_image_shape(sample_idx),
+ 'image_path': img_path
+ }
+ info['image'] = image_info
+
+ K, Rt = self.get_calibration(sample_idx)
+ calib_info = {'K': K, 'Rt': Rt}
+ info['calib'] = calib_info
+
+ if has_label:
+ obj_list = self.get_label_objects(sample_idx)
+ annotations = {}
+ annotations['gt_num'] = len([
+ obj.classname for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ])
+ if annotations['gt_num'] != 0:
+ annotations['name'] = np.array([
+ obj.classname for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ])
+ annotations['bbox'] = np.concatenate([
+ obj.box2d.reshape(1, 4) for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ],
+ axis=0)
+ annotations['location'] = np.concatenate([
+ obj.centroid.reshape(1, 3) for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ],
+ axis=0)
+ annotations['dimensions'] = 2 * np.array([
+ [obj.l, obj.w, obj.h] for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ]) # lwh (depth) format
+ annotations['rotation_y'] = np.array([
+ obj.heading_angle for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ])
+ annotations['index'] = np.arange(
+ len(obj_list), dtype=np.int32)
+ annotations['class'] = np.array([
+ self.cat2label[obj.classname] for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ])
+ annotations['gt_boxes_upright_depth'] = np.stack(
+ [
+ obj.box3d for obj in obj_list
+ if obj.classname in self.cat2label.keys()
+ ],
+                    axis=0)  # (K, 7): x, y, z, dx, dy, dz, heading
+ info['annos'] = annotations
+ return info
+
+ sample_id_list = sample_id_list if \
+ sample_id_list is not None else self.sample_id_list
+ with futures.ThreadPoolExecutor(num_workers) as executor:
+ infos = executor.map(process_single_scene, sample_id_list)
+ return list(infos)
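+# Hypothetical usage of SUNRGBDData (paths are placeholders; the converter
+# script that drives this class is expected to look roughly like this):
+#
+#     sunrgbd = SUNRGBDData(root_path='./data/sunrgbd', split='train')
+#     infos = sunrgbd.get_infos(num_workers=4, has_label=True)
+#     mmcv.dump(infos, './data/sunrgbd/sunrgbd_infos_train.pkl')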
diff --git a/adzoo/bevformer/data_converter/waymo_converter.py b/adzoo/bevformer/data_converter/waymo_converter.py
new file mode 100755
index 0000000..94fcae1
--- /dev/null
+++ b/adzoo/bevformer/data_converter/waymo_converter.py
@@ -0,0 +1,519 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Adapted from the Waymo to KITTI converter."""
+
+try:
+ from waymo_open_dataset import dataset_pb2
+except ImportError:
+ raise ImportError(
+ 'Please run "pip install waymo-open-dataset-tf-2-2-0==1.2.0" '
+ 'to install the official devkit first.')
+
+import mmcv
+import numpy as np
+import tensorflow as tf
+from glob import glob
+from os.path import join
+from waymo_open_dataset.utils import range_image_utils, transform_utils
+from waymo_open_dataset.utils.frame_utils import \
+ parse_range_image_and_camera_projection
+
+
+class Waymo2KITTI(object):
+ """Waymo to KITTI converter.
+
+ This class serves as the converter to change the waymo raw data to KITTI
+ format.
+
+ Args:
+ load_dir (str): Directory to load waymo raw data.
+ save_dir (str): Directory to save data in KITTI format.
+ prefix (str): Prefix of filename. In general, 0 for training, 1 for
+ validation and 2 for testing.
+        workers (int): Number of workers for the parallel process.
+ test_mode (bool): Whether in the test_mode. Default: False.
+ """
+
+ def __init__(self,
+ load_dir,
+ save_dir,
+ prefix,
+ workers=64,
+ test_mode=False):
+ self.filter_empty_3dboxes = True
+ self.filter_no_label_zone_points = True
+
+ self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST']
+
+ # Only data collected in specific locations will be converted
+ # If set None, this filter is disabled
+ # Available options: location_sf (main dataset)
+ self.selected_waymo_locations = None
+ self.save_track_id = False
+
+ # turn on eager execution for older tensorflow versions
+ if int(tf.__version__.split('.')[0]) < 2:
+ tf.enable_eager_execution()
+
+ self.lidar_list = [
+ '_FRONT', '_FRONT_RIGHT', '_FRONT_LEFT', '_SIDE_RIGHT',
+ '_SIDE_LEFT'
+ ]
+ self.type_list = [
+ 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST'
+ ]
+ self.waymo_to_kitti_class_map = {
+ 'UNKNOWN': 'DontCare',
+ 'PEDESTRIAN': 'Pedestrian',
+ 'VEHICLE': 'Car',
+ 'CYCLIST': 'Cyclist',
+ 'SIGN': 'Sign' # not in kitti
+ }
+
+ self.load_dir = load_dir
+ self.save_dir = save_dir
+ self.prefix = prefix
+ self.workers = int(workers)
+ self.test_mode = test_mode
+
+ self.tfrecord_pathnames = sorted(
+ glob(join(self.load_dir, '*.tfrecord')))
+
+ self.label_save_dir = f'{self.save_dir}/label_'
+ self.label_all_save_dir = f'{self.save_dir}/label_all'
+ self.image_save_dir = f'{self.save_dir}/image_'
+ self.calib_save_dir = f'{self.save_dir}/calib'
+ self.point_cloud_save_dir = f'{self.save_dir}/velodyne'
+ self.pose_save_dir = f'{self.save_dir}/pose'
+
+ self.create_folder()
+
+ def convert(self):
+ """Convert action."""
+ print('Start converting ...')
+ mmcv.track_parallel_progress(self.convert_one, range(len(self)),
+ self.workers)
+ print('\nFinished ...')
+
+ def convert_one(self, file_idx):
+ """Convert action for single file.
+
+ Args:
+ file_idx (int): Index of the file to be converted.
+ """
+ pathname = self.tfrecord_pathnames[file_idx]
+ dataset = tf.data.TFRecordDataset(pathname, compression_type='')
+
+ for frame_idx, data in enumerate(dataset):
+
+ if frame_idx % 5 != 0:
+ continue
+ # print(frame_idx)
+ frame = dataset_pb2.Frame()
+ frame.ParseFromString(bytearray(data.numpy()))
+ if (self.selected_waymo_locations is not None
+ and frame.context.stats.location
+ not in self.selected_waymo_locations):
+ continue
+
+ self.save_image(frame, file_idx, frame_idx)
+ self.save_calib(frame, file_idx, frame_idx)
+ self.save_lidar(frame, file_idx, frame_idx)
+ self.save_pose(frame, file_idx, frame_idx)
+
+ if not self.test_mode:
+ self.save_label(frame, file_idx, frame_idx)
+
+ def __len__(self):
+ """Length of the filename list."""
+ return len(self.tfrecord_pathnames)
+
+ def save_image(self, frame, file_idx, frame_idx):
+ """Parse and save the images in png format.
+
+ Args:
+ frame (:obj:`Frame`): Open dataset frame proto.
+ file_idx (int): Current file index.
+ frame_idx (int): Current frame index.
+ """
+ for img in frame.images:
+ img_path = f'{self.image_save_dir}{str(img.name - 1)}/' + \
+ f'{self.prefix}{str(file_idx).zfill(3)}' + \
+ f'{str(frame_idx).zfill(3)}.png'
+ img = mmcv.imfrombytes(img.image)
+ mmcv.imwrite(img, img_path)
+
+ def save_calib(self, frame, file_idx, frame_idx):
+ """Parse and save the calibration data.
+
+ Args:
+ frame (:obj:`Frame`): Open dataset frame proto.
+ file_idx (int): Current file index.
+ frame_idx (int): Current frame index.
+ """
+ # waymo front camera to kitti reference camera
+ T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
+ [1.0, 0.0, 0.0]])
+ camera_calibs = []
+ R0_rect = [f'{i:e}' for i in np.eye(3).flatten()]
+ Tr_velo_to_cams = []
+ calib_context = ''
+
+ for camera in frame.context.camera_calibrations:
+ # extrinsic parameters
+ T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
+ 4, 4)
+ T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
+ Tr_velo_to_cam = \
+ self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
+ if camera.name == 1: # FRONT = 1, see dataset.proto for details
+ self.T_velo_to_front_cam = Tr_velo_to_cam.copy()
+ Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, ))
+ Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam])
+
+ # intrinsic parameters
+ camera_calib = np.zeros((3, 4))
+ camera_calib[0, 0] = camera.intrinsic[0]
+ camera_calib[1, 1] = camera.intrinsic[1]
+ camera_calib[0, 2] = camera.intrinsic[2]
+ camera_calib[1, 2] = camera.intrinsic[3]
+ camera_calib[2, 2] = 1
+ camera_calib = list(camera_calib.reshape(12))
+ camera_calib = [f'{i:e}' for i in camera_calib]
+ camera_calibs.append(camera_calib)
+
+ # all camera ids are saved as id-1 in the result because
+ # camera 0 is unknown in the proto
+ for i in range(5):
+ calib_context += 'P' + str(i) + ': ' + \
+ ' '.join(camera_calibs[i]) + '\n'
+ calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n'
+ for i in range(5):
+ calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \
+ ' '.join(Tr_velo_to_cams[i]) + '\n'
+
+ with open(
+ f'{self.calib_save_dir}/{self.prefix}' +
+ f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt',
+ 'w+') as fp_calib:
+ fp_calib.write(calib_context)
+
+ def save_lidar(self, frame, file_idx, frame_idx):
+        """Parse and save the lidar data in binary (.bin) format.
+
+ Args:
+ frame (:obj:`Frame`): Open dataset frame proto.
+ file_idx (int): Current file index.
+ frame_idx (int): Current frame index.
+ """
+ range_images, camera_projections, range_image_top_pose = \
+ parse_range_image_and_camera_projection(frame)
+
+ # First return
+ points_0, cp_points_0, intensity_0, elongation_0 = \
+ self.convert_range_image_to_point_cloud(
+ frame,
+ range_images,
+ camera_projections,
+ range_image_top_pose,
+ ri_index=0
+ )
+ points_0 = np.concatenate(points_0, axis=0)
+ intensity_0 = np.concatenate(intensity_0, axis=0)
+ elongation_0 = np.concatenate(elongation_0, axis=0)
+
+ # Second return
+ points_1, cp_points_1, intensity_1, elongation_1 = \
+ self.convert_range_image_to_point_cloud(
+ frame,
+ range_images,
+ camera_projections,
+ range_image_top_pose,
+ ri_index=1
+ )
+ points_1 = np.concatenate(points_1, axis=0)
+ intensity_1 = np.concatenate(intensity_1, axis=0)
+ elongation_1 = np.concatenate(elongation_1, axis=0)
+
+ points = np.concatenate([points_0, points_1], axis=0)
+ intensity = np.concatenate([intensity_0, intensity_1], axis=0)
+ elongation = np.concatenate([elongation_0, elongation_1], axis=0)
+ timestamp = frame.timestamp_micros * np.ones_like(intensity)
+
+ # concatenate x,y,z, intensity, elongation, timestamp (6-dim)
+ point_cloud = np.column_stack(
+ (points, intensity, elongation, timestamp))
+
+ pc_path = f'{self.point_cloud_save_dir}/{self.prefix}' + \
+ f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin'
+ point_cloud.astype(np.float32).tofile(pc_path)
+
+ def save_label(self, frame, file_idx, frame_idx):
+ """Parse and save the label data in txt format.
+ The relation between waymo and kitti coordinates is noteworthy:
+ 1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti)
+ 2. x-y-z: front-left-up (waymo) -> right-down-front(kitti)
+ 3. bbox origin at volumetric center (waymo) -> bottom center (kitti)
+ 4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo)
+
+ Args:
+ frame (:obj:`Frame`): Open dataset frame proto.
+ file_idx (int): Current file index.
+ frame_idx (int): Current frame index.
+ """
+ fp_label_all = open(
+ f'{self.label_all_save_dir}/{self.prefix}' +
+ f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+')
+ id_to_bbox = dict()
+ id_to_name = dict()
+ for labels in frame.projected_lidar_labels:
+ name = labels.name
+ for label in labels.labels:
+ # TODO: need a workaround as bbox may not belong to front cam
+ bbox = [
+ label.box.center_x - label.box.length / 2,
+ label.box.center_y - label.box.width / 2,
+ label.box.center_x + label.box.length / 2,
+ label.box.center_y + label.box.width / 2
+ ]
+ id_to_bbox[label.id] = bbox
+ id_to_name[label.id] = name - 1
+
+ for obj in frame.laser_labels:
+ bounding_box = None
+ name = None
+ id = obj.id
+ for lidar in self.lidar_list:
+ if id + lidar in id_to_bbox:
+ bounding_box = id_to_bbox.get(id + lidar)
+ name = str(id_to_name.get(id + lidar))
+ break
+
+ if bounding_box is None or name is None:
+ name = '0'
+ bounding_box = (0, 0, 0, 0)
+
+ my_type = self.type_list[obj.type]
+
+ if my_type not in self.selected_waymo_classes:
+ continue
+
+ if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
+ continue
+
+ my_type = self.waymo_to_kitti_class_map[my_type]
+
+ height = obj.box.height
+ width = obj.box.width
+ length = obj.box.length
+
+ x = obj.box.center_x
+ y = obj.box.center_y
+ z = obj.box.center_z - height / 2
+
+ # project bounding box to the virtual reference frame
+ pt_ref = self.T_velo_to_front_cam @ \
+ np.array([x, y, z, 1]).reshape((4, 1))
+ x, y, z, _ = pt_ref.flatten().tolist()
+
+ rotation_y = -obj.box.heading - np.pi / 2
+ track_id = obj.id
+
+ # not available
+ truncated = 0
+ occluded = 0
+ alpha = -10
+
+ line = my_type + \
+ ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format(
+ round(truncated, 2), occluded, round(alpha, 2),
+ round(bounding_box[0], 2), round(bounding_box[1], 2),
+ round(bounding_box[2], 2), round(bounding_box[3], 2),
+ round(height, 2), round(width, 2), round(length, 2),
+ round(x, 2), round(y, 2), round(z, 2),
+ round(rotation_y, 2))
+
+ if self.save_track_id:
+ line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n'
+ else:
+ line_all = line[:-1] + ' ' + name + '\n'
+
+ fp_label = open(
+ f'{self.label_save_dir}{name}/{self.prefix}' +
+ f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'a')
+ fp_label.write(line)
+ fp_label.close()
+
+ fp_label_all.write(line_all)
+
+ fp_label_all.close()
+
+ def save_pose(self, frame, file_idx, frame_idx):
+ """Parse and save the pose data.
+
+        Note that the SDC's own pose is not included in the regular KITTI
+        training data. The KITTI raw dataset contains ego-motion files, but
+        they are not often used. Pose is important for algorithms that
+        take advantage of temporal information.
+
+ Args:
+ frame (:obj:`Frame`): Open dataset frame proto.
+ file_idx (int): Current file index.
+ frame_idx (int): Current frame index.
+ """
+ pose = np.array(frame.pose.transform).reshape(4, 4)
+ np.savetxt(
+ join(f'{self.pose_save_dir}/{self.prefix}' +
+ f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'),
+ pose)
+
+ def create_folder(self):
+ """Create folder for data preprocessing."""
+ if not self.test_mode:
+ dir_list1 = [
+ self.label_all_save_dir, self.calib_save_dir,
+ self.point_cloud_save_dir, self.pose_save_dir
+ ]
+ dir_list2 = [self.label_save_dir, self.image_save_dir]
+ else:
+ dir_list1 = [
+ self.calib_save_dir, self.point_cloud_save_dir,
+ self.pose_save_dir
+ ]
+ dir_list2 = [self.image_save_dir]
+ for d in dir_list1:
+ mmcv.mkdir_or_exist(d)
+ for d in dir_list2:
+ for i in range(5):
+ mmcv.mkdir_or_exist(f'{d}{str(i)}')
+
+ def convert_range_image_to_point_cloud(self,
+ frame,
+ range_images,
+ camera_projections,
+ range_image_top_pose,
+ ri_index=0):
+ """Convert range images to point cloud.
+
+ Args:
+ frame (:obj:`Frame`): Open dataset frame.
+ range_images (dict): Mapping from laser_name to list of two
+ range images corresponding with two returns.
+ camera_projections (dict): Mapping from laser_name to list of two
+ camera projections corresponding with two returns.
+ range_image_top_pose (:obj:`Transform`): Range image pixel pose for
+ top lidar.
+ ri_index (int): 0 for the first return, 1 for the second return.
+ Default: 0.
+
+ Returns:
+ tuple[list[np.ndarray]]: (List of points with shape [N, 3],
+ camera projections of points with shape [N, 6], intensity
+ with shape [N, 1], elongation with shape [N, 1]). All the
+ lists have the length of lidar numbers (5).
+ """
+ calibrations = sorted(
+ frame.context.laser_calibrations, key=lambda c: c.name)
+ points = []
+ cp_points = []
+ intensity = []
+ elongation = []
+
+ frame_pose = tf.convert_to_tensor(
+ value=np.reshape(np.array(frame.pose.transform), [4, 4]))
+ # [H, W, 6]
+ range_image_top_pose_tensor = tf.reshape(
+ tf.convert_to_tensor(value=range_image_top_pose.data),
+ range_image_top_pose.shape.dims)
+ # [H, W, 3, 3]
+ range_image_top_pose_tensor_rotation = \
+ transform_utils.get_rotation_matrix(
+ range_image_top_pose_tensor[..., 0],
+ range_image_top_pose_tensor[..., 1],
+ range_image_top_pose_tensor[..., 2])
+ range_image_top_pose_tensor_translation = \
+ range_image_top_pose_tensor[..., 3:]
+ range_image_top_pose_tensor = transform_utils.get_transform(
+ range_image_top_pose_tensor_rotation,
+ range_image_top_pose_tensor_translation)
+ for c in calibrations:
+ range_image = range_images[c.name][ri_index]
+ if len(c.beam_inclinations) == 0:
+ beam_inclinations = range_image_utils.compute_inclination(
+ tf.constant(
+ [c.beam_inclination_min, c.beam_inclination_max]),
+ height=range_image.shape.dims[0])
+ else:
+ beam_inclinations = tf.constant(c.beam_inclinations)
+
+ beam_inclinations = tf.reverse(beam_inclinations, axis=[-1])
+ extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4])
+
+ range_image_tensor = tf.reshape(
+ tf.convert_to_tensor(value=range_image.data),
+ range_image.shape.dims)
+ pixel_pose_local = None
+ frame_pose_local = None
+ if c.name == dataset_pb2.LaserName.TOP:
+ pixel_pose_local = range_image_top_pose_tensor
+ pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0)
+ frame_pose_local = tf.expand_dims(frame_pose, axis=0)
+ range_image_mask = range_image_tensor[..., 0] > 0
+
+ if self.filter_no_label_zone_points:
+ nlz_mask = range_image_tensor[..., 3] != 1.0 # 1.0: in NLZ
+ range_image_mask = range_image_mask & nlz_mask
+
+ range_image_cartesian = \
+ range_image_utils.extract_point_cloud_from_range_image(
+ tf.expand_dims(range_image_tensor[..., 0], axis=0),
+ tf.expand_dims(extrinsic, axis=0),
+ tf.expand_dims(tf.convert_to_tensor(
+ value=beam_inclinations), axis=0),
+ pixel_pose=pixel_pose_local,
+ frame_pose=frame_pose_local)
+
+ range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0)
+ points_tensor = tf.gather_nd(range_image_cartesian,
+ tf.compat.v1.where(range_image_mask))
+
+ cp = camera_projections[c.name][ri_index]
+ cp_tensor = tf.reshape(
+ tf.convert_to_tensor(value=cp.data), cp.shape.dims)
+ cp_points_tensor = tf.gather_nd(
+ cp_tensor, tf.compat.v1.where(range_image_mask))
+ points.append(points_tensor.numpy())
+ cp_points.append(cp_points_tensor.numpy())
+
+ intensity_tensor = tf.gather_nd(range_image_tensor[..., 1],
+ tf.where(range_image_mask))
+ intensity.append(intensity_tensor.numpy())
+
+ elongation_tensor = tf.gather_nd(range_image_tensor[..., 2],
+ tf.where(range_image_mask))
+ elongation.append(elongation_tensor.numpy())
+
+ return points, cp_points, intensity, elongation
+
+ def cart_to_homo(self, mat):
+ """Convert transformation matrix in Cartesian coordinates to
+ homogeneous format.
+
+ Args:
+ mat (np.ndarray): Transformation matrix in Cartesian.
+ The input matrix shape is 3x3 or 3x4.
+
+ Returns:
+ np.ndarray: Transformation matrix in homogeneous format.
+ The matrix shape is 4x4.
+ """
+ ret = np.eye(4)
+ if mat.shape == (3, 3):
+ ret[:3, :3] = mat
+ elif mat.shape == (3, 4):
+ ret[:3, :] = mat
+ else:
+ raise ValueError(mat.shape)
+ return ret
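+# Hypothetical usage of the converter (directory layout is a placeholder;
+# prefix '0' marks the training split as noted in the class docstring):
+#
+#     converter = Waymo2KITTI(
+#         load_dir='./data/waymo/waymo_format/training',
+#         save_dir='./data/waymo/kitti_format/training',
+#         prefix='0',
+#         workers=8,
+#         test_mode=False)
+#     converter.convert()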
diff --git a/adzoo/bevformer/dist_test.sh b/adzoo/bevformer/dist_test.sh
new file mode 100755
index 0000000..8b19a04
--- /dev/null
+++ b/adzoo/bevformer/dist_test.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29203}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox
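+# Example invocation (config/checkpoint paths are hypothetical):
+#   ./adzoo/bevformer/dist_test.sh path/to/config.py path/to/checkpoint.pth 8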
diff --git a/adzoo/bevformer/dist_train.sh b/adzoo/bevformer/dist_train.sh
new file mode 100755
index 0000000..84d7fd7
--- /dev/null
+++ b/adzoo/bevformer/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-38912}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
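+# Example invocation (config path is hypothetical):
+#   ./adzoo/bevformer/dist_train.sh path/to/config.py 8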
diff --git a/adzoo/bevformer/fp16/dist_train.sh b/adzoo/bevformer/fp16/dist_train.sh
new file mode 100755
index 0000000..4ac9a15
--- /dev/null
+++ b/adzoo/bevformer/fp16/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28508}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
diff --git a/adzoo/bevformer/fp16/train.py b/adzoo/bevformer/fp16/train.py
new file mode 100644
index 0000000..eddc349
--- /dev/null
+++ b/adzoo/bevformer/fp16/train.py
@@ -0,0 +1,271 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import division
+
+import argparse
+import copy
+import mmcv
+import os
+import time
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model
+from os import path as osp
+
+from mmdet import __version__ as mmdet_version
+from mmdet3d import __version__ as mmdet3d_version
+#from mmdet3d.apis import train_model
+
+from mmdet3d.datasets import build_dataset
+from mmdet3d.models import build_model
+from mmdet3d.utils import collect_env, get_root_logger
+from mmdet.apis import set_random_seed
+from mmseg import __version__ as mmseg_version
+
+from mmcv.utils import TORCH_VERSION, digit_version
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Train a detector')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--work-dir', help='the dir to save logs and models')
+ parser.add_argument(
+ '--resume-from', help='the checkpoint file to resume from')
+ parser.add_argument(
+ '--no-validate',
+ action='store_true',
+ help='whether not to evaluate the checkpoint during training')
+ group_gpus = parser.add_mutually_exclusive_group()
+ group_gpus.add_argument(
+ '--gpus',
+ type=int,
+ help='number of gpus to use '
+ '(only applicable to non-distributed training)')
+ group_gpus.add_argument(
+ '--gpu-ids',
+ type=int,
+ nargs='+',
+ help='ids of gpus to use '
+ '(only applicable to non-distributed training)')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file (deprecate), '
+ 'change to --cfg-options instead.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local_rank', type=int, default=0)
+ parser.add_argument(
+ '--autoscale-lr',
+ action='store_true',
+ help='automatically scale lr with the number of gpus')
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ if args.options and args.cfg_options:
+ raise ValueError(
+ '--options and --cfg-options cannot be both specified, '
+ '--options is deprecated in favor of --cfg-options')
+ if args.options:
+ warnings.warn('--options is deprecated in favor of --cfg-options')
+ args.cfg_options = args.options
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+
+    # import modules from plugin/xx, registry will be updated
+ if hasattr(cfg, 'plugin'):
+ if cfg.plugin:
+ import importlib
+ if hasattr(cfg, 'plugin_dir'):
+ plugin_dir = cfg.plugin_dir
+ _module_dir = os.path.dirname(plugin_dir)
+ _module_dir = _module_dir.split('/')
+ _module_path = _module_dir[0]
+
+ for m in _module_dir[1:]:
+ _module_path = _module_path + '.' + m
+ print(_module_path)
+ plg_lib = importlib.import_module(_module_path)
+ else:
+ # import dir is the dirpath for the config file
+ _module_dir = os.path.dirname(args.config)
+ _module_dir = _module_dir.split('/')
+ _module_path = _module_dir[0]
+ for m in _module_dir[1:]:
+ _module_path = _module_path + '.' + m
+ print(_module_path)
+ plg_lib = importlib.import_module(_module_path)
+
+ from projects.mmdet3d_plugin.bevformer.apis import custom_train_model
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+
+ # work_dir is determined in this priority: CLI > segment in file > filename
+ if args.work_dir is not None:
+ # update configs according to CLI args if args.work_dir is not None
+ cfg.work_dir = args.work_dir
+ elif cfg.get('work_dir', None) is None:
+ # use config filename as default work_dir if cfg.work_dir is None
+ cfg.work_dir = osp.join('./work_dirs',
+ osp.splitext(osp.basename(args.config))[0])
+ #if args.resume_from is not None:
+
+ if args.resume_from is not None and osp.isfile(args.resume_from):
+ cfg.resume_from = args.resume_from
+
+ if args.gpu_ids is not None:
+ cfg.gpu_ids = args.gpu_ids
+ else:
+ cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+ if digit_version(TORCH_VERSION) != digit_version('1.8.1'):
+ cfg.optimizer['type'] = 'AdamW'
+ if args.autoscale_lr:
+ # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+ cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+        assert False, 'Non-distributed training is not supported!'
+ distributed = False
+ else:
+ distributed = True
+ init_dist(args.launcher, **cfg.dist_params)
+ # re-set gpu_ids with distributed training mode
+ _, world_size = get_dist_info()
+ cfg.gpu_ids = range(world_size)
+
+ # create work_dir
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+ # dump config
+ cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+ # init the logger before other steps
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+ log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+ # specify logger name, if we still use 'mmdet', the output info will be
+ # filtered and won't be saved in the log_file
+ # TODO: ugly workaround to judge whether we are training det or seg model
+ if cfg.model.type in ['EncoderDecoder3D']:
+ logger_name = 'mmseg'
+ else:
+ logger_name = 'mmdet'
+ logger = get_root_logger(
+ log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+ # init the meta dict to record some important information such as
+ # environment info and seed, which will be logged
+ meta = dict()
+ # log env info
+ env_info_dict = collect_env()
+ env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+ dash_line = '-' * 60 + '\n'
+ logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+ dash_line)
+ meta['env_info'] = env_info
+ meta['config'] = cfg.pretty_text
+
+ # log some basic info
+ logger.info(f'Distributed training: {distributed}')
+ logger.info(f'Config:\n{cfg.pretty_text}')
+
+ # set random seeds
+ if args.seed is not None:
+ logger.info(f'Set random seed to {args.seed}, '
+ f'deterministic: {args.deterministic}')
+ set_random_seed(args.seed, deterministic=args.deterministic)
+ cfg.seed = args.seed
+ meta['seed'] = args.seed
+ meta['exp_name'] = osp.basename(args.config)
+
+ model = build_model(
+ cfg.model,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+ model.init_weights()
+
+ eval_model_config = copy.deepcopy(cfg.model)
+ eval_model = build_model(
+ eval_model_config,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(eval_model)
+
+ #eval_model.init_weights()
+ eval_model.load_state_dict(model.state_dict())
+
+ logger.info(f'Model:\n{model}')
+ from projects.mmdet3d_plugin.datasets import custom_build_dataset
+ datasets = [custom_build_dataset(cfg.data.train)]
+ if len(cfg.workflow) == 2:
+ val_dataset = copy.deepcopy(cfg.data.val)
+ # in case we use a dataset wrapper
+ if 'dataset' in cfg.data.train:
+ val_dataset.pipeline = cfg.data.train.dataset.pipeline
+ else:
+ val_dataset.pipeline = cfg.data.train.pipeline
+ # set test_mode=False here in deep copied config
+            # which does not affect AP/AR calculation later
+ # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
+ val_dataset.test_mode = False
+ datasets.append(custom_build_dataset(val_dataset))
+ if cfg.checkpoint_config is not None:
+ # save mmdet version, config file content and class names in
+ # checkpoints as meta data
+ cfg.checkpoint_config.meta = dict(
+ mmdet_version=mmdet_version,
+ mmseg_version=mmseg_version,
+ mmdet3d_version=mmdet3d_version,
+ config=cfg.pretty_text,
+ CLASSES=datasets[0].CLASSES,
+ PALETTE=datasets[0].PALETTE # for segmentors
+ if hasattr(datasets[0], 'PALETTE') else None)
+ # add an attribute for visualization convenience
+ model.CLASSES = datasets[0].CLASSES
+ custom_train_model(
+ model,
+ datasets,
+ cfg,
+ eval_model=eval_model,
+ distributed=distributed,
+ validate=(not args.no_validate),
+ timestamp=timestamp,
+ meta=meta)
+
+
+if __name__ == '__main__':
+ main()
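+# This entry point is normally launched through torch.distributed via the
+# companion launcher script (config path is hypothetical):
+#   ./adzoo/bevformer/fp16/dist_train.sh path/to/fp16_config.py 8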
diff --git a/adzoo/bevformer/misc/browse_dataset.py b/adzoo/bevformer/misc/browse_dataset.py
new file mode 100755
index 0000000..e3419f6
--- /dev/null
+++ b/adzoo/bevformer/misc/browse_dataset.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import numpy as np
+import warnings
+from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress
+from os import path as osp
+
+from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
+ DepthInstance3DBoxes, LiDARInstance3DBoxes)
+from mmdet3d.core.visualizer import (show_multi_modality_result, show_result,
+ show_seg_result)
+from mmdet3d.datasets import build_dataset
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Browse a dataset')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument(
+ '--skip-type',
+ type=str,
+ nargs='+',
+ default=['Normalize'],
+ help='skip some useless pipeline')
+ parser.add_argument(
+ '--output-dir',
+ default=None,
+ type=str,
+ help='If there is no display interface, you can save it')
+ parser.add_argument(
+ '--task',
+ type=str,
+ choices=['det', 'seg', 'multi_modality-det', 'mono-det'],
+ help='Determine the visualization method depending on the task.')
+ parser.add_argument(
+ '--online',
+ action='store_true',
+ help='Whether to perform online visualization. Note that you often '
+ 'need a monitor to do so.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ args = parser.parse_args()
+ return args
+
+
+def build_data_cfg(config_path, skip_type, cfg_options):
+ """Build data config for loading visualization data."""
+ cfg = Config.fromfile(config_path)
+ if cfg_options is not None:
+ cfg.merge_from_dict(cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+ # extract inner dataset of `RepeatDataset` as `cfg.data.train`
+ # so we don't need to worry about it later
+ if cfg.data.train['type'] == 'RepeatDataset':
+ cfg.data.train = cfg.data.train.dataset
+ # use only first dataset for `ConcatDataset`
+ if cfg.data.train['type'] == 'ConcatDataset':
+ cfg.data.train = cfg.data.train.datasets[0]
+ train_data_cfg = cfg.data.train
+ # eval_pipeline purely consists of loading functions
+ # use eval_pipeline for data loading
+ train_data_cfg['pipeline'] = [
+ x for x in cfg.eval_pipeline if x['type'] not in skip_type
+ ]
+
+ return cfg
+
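+# Example usage (hypothetical config path, shown only as a sketch):
+#   cfg = build_data_cfg('configs/bevformer_base.py',
+#                        skip_type=['Normalize'], cfg_options=None)
+#   dataset = build_dataset(cfg.data.train)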
+
+def to_depth_mode(points, bboxes):
+ """Convert points and bboxes to Depth Coord and Depth Box mode."""
+ if points is not None:
+ points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
+ Coord3DMode.DEPTH)
+ if bboxes is not None:
+ bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
+ Box3DMode.DEPTH)
+ return points, bboxes
+
+
+def show_det_data(idx, dataset, out_dir, filename, show=False):
+ """Visualize 3D point cloud and 3D bboxes."""
+ example = dataset.prepare_train_data(idx)
+ points = example['points']._data.numpy()
+ gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor
+ if dataset.box_mode_3d != Box3DMode.DEPTH:
+ points, gt_bboxes = to_depth_mode(points, gt_bboxes)
+ show_result(
+ points,
+ gt_bboxes.clone(),
+ None,
+ out_dir,
+ filename,
+ show=show,
+ snapshot=True)
+
+
+def show_seg_data(idx, dataset, out_dir, filename, show=False):
+ """Visualize 3D point cloud and segmentation mask."""
+ example = dataset.prepare_train_data(idx)
+ points = example['points']._data.numpy()
+ gt_seg = example['pts_semantic_mask']._data.numpy()
+ show_seg_result(
+ points,
+ gt_seg.copy(),
+ None,
+ out_dir,
+ filename,
+ np.array(dataset.PALETTE),
+ dataset.ignore_index,
+ show=show,
+ snapshot=True)
+
+
+def show_proj_bbox_img(idx,
+ dataset,
+ out_dir,
+ filename,
+ show=False,
+ is_nus_mono=False):
+ """Visualize 3D bboxes on 2D image by projection."""
+ try:
+ example = dataset.prepare_train_data(idx)
+ except AttributeError: # for Mono-3D datasets
+ example = dataset.prepare_train_img(idx)
+ gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d']
+ img_metas = example['img_metas']._data
+ img = example['img']._data.numpy()
+ # need to transpose channel to first dim
+ img = img.transpose(1, 2, 0)
+ # no 3D gt bboxes, just show img
+ if gt_bboxes.tensor.shape[0] == 0:
+ gt_bboxes = None
+ if isinstance(gt_bboxes, DepthInstance3DBoxes):
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ None,
+ None,
+ out_dir,
+ filename,
+ box_mode='depth',
+ img_metas=img_metas,
+ show=show)
+ elif isinstance(gt_bboxes, LiDARInstance3DBoxes):
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ None,
+ img_metas['lidar2img'],
+ out_dir,
+ filename,
+ box_mode='lidar',
+ img_metas=img_metas,
+ show=show)
+ elif isinstance(gt_bboxes, CameraInstance3DBoxes):
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ None,
+ img_metas['cam2img'],
+ out_dir,
+ filename,
+ box_mode='camera',
+ img_metas=img_metas,
+ show=show)
+ else:
+ # can't project, just show img
+ warnings.warn(
+ f'unrecognized gt box type {type(gt_bboxes)}, only show image')
+ show_multi_modality_result(
+ img, None, None, None, out_dir, filename, show=show)
+
+
+def main():
+ args = parse_args()
+
+ if args.output_dir is not None:
+ mkdir_or_exist(args.output_dir)
+
+ cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options)
+ try:
+ dataset = build_dataset(
+ cfg.data.train, default_args=dict(filter_empty_gt=False))
+ except TypeError: # seg dataset doesn't have `filter_empty_gt` key
+ dataset = build_dataset(cfg.data.train)
+ data_infos = dataset.data_infos
+ dataset_type = cfg.dataset_type
+
+ # configure visualization mode
+ vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det'
+
+ for idx, data_info in enumerate(track_iter_progress(data_infos)):
+ if dataset_type in ['KittiDataset', 'WaymoDataset']:
+ data_path = data_info['point_cloud']['velodyne_path']
+ elif dataset_type in [
+ 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset',
+ 'S3DISSegDataset', 'S3DISDataset'
+ ]:
+ data_path = data_info['pts_path']
+ elif dataset_type in ['NuScenesDataset', 'LyftDataset']:
+ data_path = data_info['lidar_path']
+ elif dataset_type in ['NuScenesMonoDataset']:
+ data_path = data_info['file_name']
+ else:
+ raise NotImplementedError(
+ f'unsupported dataset type {dataset_type}')
+
+ file_name = osp.splitext(osp.basename(data_path))[0]
+
+ if vis_task in ['det', 'multi_modality-det']:
+ # show 3D bboxes on 3D point clouds
+ show_det_data(
+ idx, dataset, args.output_dir, file_name, show=args.online)
+ if vis_task in ['multi_modality-det', 'mono-det']:
+ # project 3D bboxes to 2D image
+ show_proj_bbox_img(
+ idx,
+ dataset,
+ args.output_dir,
+ file_name,
+ show=args.online,
+ is_nus_mono=(dataset_type == 'NuScenesMonoDataset'))
+ elif vis_task in ['seg']:
+ # show 3D segmentation mask on 3D point clouds
+ show_seg_data(
+ idx, dataset, args.output_dir, file_name, show=args.online)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/misc/print_config.py b/adzoo/bevformer/misc/print_config.py
new file mode 100755
index 0000000..3100fc3
--- /dev/null
+++ b/adzoo/bevformer/misc/print_config.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from mmcv import Config, DictAction
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Print the whole config')
+ parser.add_argument('config', help='config file path')
+ parser.add_argument(
+ '--options', nargs='+', action=DictAction, help='arguments in dict')
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ if args.options is not None:
+ cfg.merge_from_dict(args.options)
+ print(f'Config:\n{cfg.pretty_text}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/misc/visualize_results.py b/adzoo/bevformer/misc/visualize_results.py
new file mode 100755
index 0000000..302adc5
--- /dev/null
+++ b/adzoo/bevformer/misc/visualize_results.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import mmcv
+from mmcv import Config
+
+from mmdet3d.datasets import build_dataset
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet3D visualize the results')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('--result', help='results file in pickle format')
+ parser.add_argument(
+ '--show-dir', help='directory where visualize results will be saved')
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ if args.result is not None and \
+ not args.result.endswith(('.pkl', '.pickle')):
+ raise ValueError('The results file must be a pkl file.')
+
+ cfg = Config.fromfile(args.config)
+ cfg.data.test.test_mode = True
+
+ # build the dataset
+ dataset = build_dataset(cfg.data.test)
+ results = mmcv.load(args.result)
+
+ if getattr(dataset, 'show', None) is not None:
+ # data loading pipeline for showing
+ eval_pipeline = cfg.get('eval_pipeline', {})
+ if eval_pipeline:
+ dataset.show(results, args.show_dir, pipeline=eval_pipeline)
+ else:
+ dataset.show(results, args.show_dir) # use default pipeline
+ else:
+ raise NotImplementedError(
+ 'Show is not implemented for dataset {}!'.format(
+ type(dataset).__name__))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/__init__.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/__init__.py
new file mode 100644
index 0000000..0ead209
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/__init__.py
@@ -0,0 +1 @@
+from .hooks import *
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/__init__.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/__init__.py
new file mode 100644
index 0000000..15dff22
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/__init__.py
@@ -0,0 +1,3 @@
+from .train import custom_train_model
+from .mmdet_train import custom_train_detector
+# from .test import custom_multi_gpu_test
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/mmdet_train.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/mmdet_train.py
new file mode 100644
index 0000000..1a218f0
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/mmdet_train.py
@@ -0,0 +1,203 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import random
+import warnings
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch.nn import DataParallel
+from torch.nn.parallel.distributed import DistributedDataParallel
+from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
+ Fp16OptimizerHook, OptimizerHook,
+ build_runner, )
+from mmcv.optims import build_optimizer
+from mmcv.utils import build_from_cfg
+
+from mmcv.core import EvalHook
+
+from mmcv.datasets import (build_dataset, replace_ImageToTensor)
+from mmcv.utils import get_root_logger, get_dist_info
+import time
+import os.path as osp
+from mmcv.datasets import build_dataloader
+from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook
+from adzoo.bevformer.apis.test import custom_multi_gpu_test
+
+def custom_train_detector(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ eval_model=None,
+ meta=None):
+ logger = get_root_logger(cfg.log_level)
+
+ # prepare data loaders
+
+ dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    # assert len(dataset) == 1
+ if 'imgs_per_gpu' in cfg.data:
+ logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+ 'Please use "samples_per_gpu" instead')
+ if 'samples_per_gpu' in cfg.data:
+ logger.warning(
+ f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+ f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiment')
+ else:
+ logger.warning(
+ 'Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                f'{cfg.data.imgs_per_gpu} in this experiment')
+ cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
+
+ data_loaders = [
+ build_dataloader(
+ ds,
+ cfg.data.samples_per_gpu,
+ cfg.data.workers_per_gpu,
+ # cfg.gpus will be ignored if distributed
+ len(cfg.gpu_ids),
+ dist=distributed,
+ seed=cfg.seed,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ ) for ds in dataset
+ ]
+
+ # import ipdb
+ # ipdb.set_trace()
+ # put model on gpus
+ if distributed:
+ find_unused_parameters = cfg.get('find_unused_parameters', False)
+ # Sets the `find_unused_parameters` parameter in
+ # torch.nn.parallel.DistributedDataParallel
+ model = DistributedDataParallel(
+ model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ if eval_model is not None:
+ eval_model = DistributedDataParallel(
+ eval_model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ else:
+ model = DataParallel(
+ model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+ if eval_model is not None:
+ eval_model = DataParallel(
+ eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+
+ # build runner
+ optimizer = build_optimizer(model, cfg.optimizer)
+
+ if 'runner' not in cfg:
+ cfg.runner = {
+ 'type': 'EpochBasedRunner',
+ 'max_epochs': cfg.total_epochs
+ }
+ warnings.warn(
+ 'config is now expected to have a `runner` section, '
+ 'please set `runner` in your config.', UserWarning)
+ else:
+ if 'total_epochs' in cfg:
+ assert cfg.total_epochs == cfg.runner.max_epochs
+ if eval_model is not None:
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ eval_model=eval_model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+ else:
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+
+ # an ugly workaround to make .log and .log.json filenames the same
+ runner.timestamp = timestamp
+
+ # fp16 setting
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ optimizer_config = Fp16OptimizerHook(
+ **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
+ elif distributed and 'type' not in cfg.optimizer_config:
+ optimizer_config = OptimizerHook(**cfg.optimizer_config)
+ else:
+ optimizer_config = cfg.optimizer_config
+
+ # register hooks
+ runner.register_training_hooks(cfg.lr_config, optimizer_config,
+ cfg.checkpoint_config, cfg.log_config,
+ cfg.get('momentum_config', None))
+
+ # register profiler hook
+ #trace_config = dict(type='tb_trace', dir_name='work_dir')
+ #profiler_config = dict(on_trace_ready=trace_config)
+ #runner.register_profiler_hook(profiler_config)
+
+ if distributed:
+ if isinstance(runner, EpochBasedRunner):
+ runner.register_hook(DistSamplerSeedHook())
+
+ # register eval hooks
+ if validate:
+ # Support batch_size > 1 in validation
+ val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
+ if val_samples_per_gpu > 1:
+ assert False
+ # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+ cfg.data.val.pipeline = replace_ImageToTensor(
+ cfg.data.val.pipeline)
+ val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+
+ val_dataloader = build_dataloader(
+ val_dataset,
+ samples_per_gpu=val_samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ )
+ eval_cfg = cfg.get('evaluation', {})
+ eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+ eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
+ eval_hook = CustomDistEvalHook if distributed else EvalHook
+ runner.register_hook(eval_hook(val_dataloader, test_fn=custom_multi_gpu_test, **eval_cfg))
+
+ # user-defined hooks
+ if cfg.get('custom_hooks', None):
+ custom_hooks = cfg.custom_hooks
+ assert isinstance(custom_hooks, list), \
+ f'custom_hooks expect list type, but got {type(custom_hooks)}'
+ for hook_cfg in cfg.custom_hooks:
+ assert isinstance(hook_cfg, dict), \
+ 'Each item in custom_hooks expects dict type, but got ' \
+ f'{type(hook_cfg)}'
+ hook_cfg = hook_cfg.copy()
+ priority = hook_cfg.pop('priority', 'NORMAL')
+ hook = build_from_cfg(hook_cfg, HOOKS)
+ runner.register_hook(hook, priority=priority)
+
+ if cfg.resume_from:
+ runner.resume(cfg.resume_from)
+ elif cfg.load_from:
+ runner.load_checkpoint(cfg.load_from)
+ runner.run(data_loaders, cfg.workflow)
+
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/test.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/test.py
new file mode 100644
index 0000000..cd507e4
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/test.py
@@ -0,0 +1,164 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import mmcv
+import torch
+import torch.distributed as dist
+from mmcv.image import tensor2imgs
+from mmcv.runner import get_dist_info
+
+from mmdet.core import encode_mask_results
+
+
+import mmcv
+import numpy as np
+import pycocotools.mask as mask_util
+
+def custom_encode_mask_results(mask_results):
+ """Encode bitmap mask to RLE code. Semantic Masks only
+ Args:
+ mask_results (list | tuple[list]): bitmap mask results.
+ In mask scoring rcnn, mask_results is a tuple of (segm_results,
+ segm_cls_score).
+ Returns:
+ list | tuple: RLE encoded mask.
+ """
+ cls_segms = mask_results
+ num_classes = len(cls_segms)
+ encoded_mask_results = []
+ for i in range(len(cls_segms)):
+ encoded_mask_results.append(
+ mask_util.encode(
+ np.array(
+ cls_segms[i][:, :, np.newaxis], order='F',
+ dtype='uint8'))[0]) # encoded with RLE
+ return [encoded_mask_results]
+
+def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+ """Test model with multiple gpus.
+    This method tests the model with multiple GPUs and collects the results
+    under two different modes: GPU and CPU. By setting 'gpu_collect=True',
+    it encodes results as GPU tensors and uses GPU communication for result
+    collection. In CPU mode, it saves the results from different GPUs to
+    'tmpdir' and the rank-0 worker collects them.
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ bbox_results = []
+ mask_results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ have_mask = False
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(return_loss=False, rescale=True, **data)
+ # encode mask results
+ if isinstance(result, dict):
+ if 'bbox_results' in result.keys():
+ bbox_result = result['bbox_results']
+ batch_size = len(result['bbox_results'])
+ bbox_results.extend(bbox_result)
+ if 'mask_results' in result.keys() and result['mask_results'] is not None:
+ mask_result = custom_encode_mask_results(result['mask_results'])
+ mask_results.extend(mask_result)
+ have_mask = True
+ else:
+ batch_size = len(result)
+ bbox_results.extend(result)
+
+ #if isinstance(result[0], tuple):
+ # assert False, 'this code is for instance segmentation, which our code will not utilize.'
+ # result = [(bbox_results, encode_mask_results(mask_results))
+ # for bbox_results, mask_results in result]
+ if rank == 0:
+
+ for _ in range(batch_size * world_size):
+ prog_bar.update()
+
+ # collect results from all ranks
+ if gpu_collect:
+ bbox_results = collect_results_gpu(bbox_results, len(dataset))
+ if have_mask:
+ mask_results = collect_results_gpu(mask_results, len(dataset))
+ else:
+ mask_results = None
+ else:
+ bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
+ tmpdir = tmpdir+'_mask' if tmpdir is not None else None
+ if have_mask:
+ mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
+ else:
+ mask_results = None
+
+ if mask_results is None:
+ return bbox_results
+ return {'bbox_results': bbox_results, 'mask_results': mask_results}
+
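+# Minimal usage sketch (assumes torch.distributed is already initialized and
+# the model / data_loader follow the MM-style interfaces used above):
+#   results = custom_multi_gpu_test(model, data_loader, tmpdir='.dist_test/tmp')
+#   if get_dist_info()[0] == 0:
+#       mmcv.dump(results, 'results.pkl')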
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
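+        # Rank 0 creates the directory and broadcasts its path to every other
+        # rank as a fixed-length uint8 tensor padded with ASCII spaces (32),
+        # so that all ranks dump their partial results into the same place.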
+ MAX_LEN = 512
+ # 32 is whitespace
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mmcv.mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mmcv.mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_list.append(mmcv.load(part_file))
+ # sort the results
+ ordered_results = []
+        # Because the sampler used in the evaluation stage ensures that each
+        # GPU handles a contiguous block of samples, the per-rank parts are
+        # simply concatenated in rank order instead of being interleaved.
+ #for res in zip(*part_list):
+ for res in part_list:
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir)
+ return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+    # NOTE: falls back to CPU-based collection; the return is needed so the
+    # caller receives the gathered results instead of None.
+    return collect_results_cpu(result_part, size)
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/train.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/train.py
new file mode 100644
index 0000000..dcae402
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/train.py
@@ -0,0 +1,65 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+
+from .mmdet_train import custom_train_detector
+
+def custom_train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ eval_model=None,
+ meta=None):
+ """A function wrapper for launching model training according to cfg.
+
+    Because we need a different eval_hook in the runner. Should be deprecated
+    in the future.
+ """
+ if cfg.model.type in ['EncoderDecoder3D']:
+ assert False
+ else:
+ custom_train_detector(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ eval_model=eval_model,
+ meta=meta)
+
+
+def train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ meta=None):
+ """A function wrapper for launching model training according to cfg.
+
+    Because we need a different eval_hook in the runner. Should be deprecated
+    in the future.
+ """
+ if cfg.model.type in ['EncoderDecoder3D']:
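+        # NOTE: `train_segmentor` (and `train_detector` below) are not imported
+        # in this file; this generic wrapper is kept for reference and would
+        # need those imports before it can be used.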
+ train_segmentor(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ meta=meta)
+ else:
+ train_detector(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ meta=meta)
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/__init__.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/__init__.py
new file mode 100644
index 0000000..aa04ec1
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/__init__.py
@@ -0,0 +1 @@
+from .custom_hooks import TransferWeight
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
new file mode 100644
index 0000000..ef1e35d
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
@@ -0,0 +1,12 @@
+from mmcv.runner.hooks.hook import HOOKS, Hook
+
+
+@HOOKS.register_module()
+class TransferWeight(Hook):
+
+ def __init__(self, every_n_inters=1):
+ self.every_n_inters=every_n_inters
+
+ def after_train_iter(self, runner):
+ if self.every_n_inner_iters(runner, self.every_n_inters):
+ runner.eval_model.load_state_dict(runner.model.state_dict())
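+
+
+# Usage sketch: register the hook through `custom_hooks` in a training config
+# (hypothetical snippet, not a shipped config):
+#   custom_hooks = [dict(type='TransferWeight', every_n_inters=1)]
+# After each training iteration the runner's `eval_model` is refreshed with the
+# weights of the model being optimized.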
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/__init__.py
new file mode 100644
index 0000000..64eaac4
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/__init__.py
@@ -0,0 +1 @@
+from .modeling import *
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/nuscenes.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/nuscenes.py
new file mode 100644
index 0000000..9eed59b
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/nuscenes.py
@@ -0,0 +1,360 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+#import functools
+from collections import OrderedDict
+
+import numpy as np
+import seaborn as sns
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+#from detectron2.data import MetadataCatalog
+from mmcv.structures import BoxMode
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.splits import create_splits_scenes
+
+#from tridet.data import collect_dataset_dicts
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.pose import Pose
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color
+
+# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
+# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
+# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
+# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
+# tracker if required
+DATASET_NAME_TO_VERSION = {
+ "nusc_train": "v1.0-trainval",
+ "nusc_val": "v1.0-trainval",
+ "nusc_val-subsample-8": "v1.0-trainval",
+ "nusc_trainval": "v1.0-trainval",
+ "nusc_test": "v1.0-test",
+ "nusc_mini_train": "v1.0-mini",
+ "nusc_mini_val": "v1.0-mini",
+}
+
+CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')
+
+ATTRIBUTE_IDS = {
+ 'vehicle.moving': 0,
+ 'vehicle.parked': 1,
+ 'vehicle.stopped': 2,
+ 'pedestrian.moving': 0,
+ 'pedestrian.standing': 1,
+ 'pedestrian.sitting_lying_down': 2,
+ 'cycle.with_rider': 0,
+ 'cycle.without_rider': 1,
+}
+
+CATEGORY_IDS = OrderedDict({
+ 'barrier': 0,
+ 'bicycle': 1,
+ 'bus': 2,
+ 'car': 3,
+ 'construction_vehicle': 4,
+ 'motorcycle': 5,
+ 'pedestrian': 6,
+ 'traffic_cone': 7,
+ 'trailer': 8,
+ 'truck': 9,
+})
+
+COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
+COLORMAP = OrderedDict({
+ 'barrier': COLORS[8], # yellow
+ 'bicycle': COLORS[0], # blue
+ 'bus': COLORS[6], # pink
+ 'car': COLORS[2], # green
+ 'construction_vehicle': COLORS[7], # gray
+ 'motorcycle': COLORS[4], # purple
+ 'pedestrian': COLORS[1], # orange
+ 'traffic_cone': COLORS[3], # red
+ 'trailer': COLORS[9], # skyblue
+ 'truck': COLORS[5], # brown
+})
+
+MAX_NUM_ATTRIBUTES = 3
+
+
+def _compute_iou(box1, box2):
+ """
+ Parameters
+ ----------
+ box1, box2:
+ (x1, y1, x2, y2)
+ """
+ xx1 = max(box1[0], box2[0])
+ yy1 = max(box1[1], box2[1])
+ xx2 = min(box1[2], box2[2])
+ yy2 = min(box1[3], box2[3])
+ if xx1 >= xx2 or yy1 >= yy2:
+ return 0.
+ inter = (xx2 - xx1) * (yy2 - yy1)
+ a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+ a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
+ return inter / (a1 + a2 - inter)
+
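+# e.g. _compute_iou([0, 0, 2, 2], [1, 1, 3, 3]) == 1 / 7
+# (intersection area 1, union area 4 + 4 - 1 = 7).
+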
+
+class NuscenesDataset(Dataset):
+ def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused):
+ self.data_root = data_root
+ assert name in DATASET_NAME_TO_VERSION
+ version = DATASET_NAME_TO_VERSION[name]
+ self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)
+
+ self.datum_names = datum_names
+ self.min_num_lidar_points = min_num_lidar_points
+ self.min_box_visibility = min_box_visibility
+
+ self.dataset_item_info = self._build_dataset_item_info(name)
+
+ # Index instance tokens to their IDs
+ self._instance_token_to_id = self._index_instance_tokens()
+
+ # Construct the mapping from datum_token (image id) to index
+ print("Generating the mapping from image id to idx...")
+ self.datumtoken2idx = {}
+ for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info):
+ self.datumtoken2idx[datum_token] = idx
+ print("Done.")
+
+ def _build_dataset_item_info(self, name):
+ scenes_in_split = self._get_split_scenes(name)
+
+ dataset_items = []
+ for _, scene_token in tqdm(scenes_in_split):
+ scene = self.nusc.get('scene', scene_token)
+ sample_token = scene['first_sample_token']
+ for sample_idx in range(scene['nbr_samples']):
+ if name.endswith('subsample-8') and sample_idx % 8 > 0:
+ # Sample-level subsampling.
+ continue
+
+ sample = self.nusc.get('sample', sample_token)
+ for datum_name, datum_token in sample['data'].items():
+ if datum_name not in self.datum_names:
+ continue
+ dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
+ sample_token = sample['next']
+ return dataset_items
+
+ def _get_split_scenes(self, name):
+ scenes_in_splits = create_splits_scenes()
+ if name == "nusc_trainval":
+ scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
+ elif name == "nusc_val-subsample-8":
+ scenes = scenes_in_splits["val"]
+ else:
+ assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
+ split = name[5:]
+ assert split in scenes_in_splits, f"Invalid dataset: {split}"
+ scenes = scenes_in_splits[split]
+
+ # Mapping from scene name to token.
+ name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
+ return [(name, name_to_token[name]) for name in scenes]
+
+ def __len__(self):
+ return len(self.dataset_item_info)
+
+ def _build_id(self, scene_name, sample_idx, datum_name):
+ sample_id = f"{scene_name}_{sample_idx:03d}"
+ image_id = f"{sample_id}_{datum_name}"
+ return image_id, sample_id
+
+ def _index_instance_tokens(self):
+ """Index instance tokens for uniquely identifying instances across samples"""
+ instance_token_to_id = {}
+ for record in self.nusc.sample_annotation:
+ instance_token = record['instance_token']
+ if instance_token not in instance_token_to_id:
+ next_instance_id = len(instance_token_to_id)
+ instance_token_to_id[instance_token] = next_instance_id
+ return instance_token_to_id
+
+ def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
+ annotations = []
+ for _ann in annotation_list:
+ ann = self.nusc.get('sample_annotation', _ann.token)
+ if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
+ continue
+ annotation = OrderedDict()
+
+ # --------
+ # Category
+ # --------
+ category = category_to_detection_name(ann['category_name'])
+ if category is None:
+ continue
+ annotation['category_id'] = CATEGORY_IDS[category]
+
+ # ------
+ # 3D box
+ # ------
+ # NOTE: ann['rotation'], ann['translation'] is in global frame.
+ pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center) # pose in sensor frame
+ # DEBUG:
+ # pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation']))
+ # pose_WO_2 = pose_WS * pose_SO
+ # assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix)
+ bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
+ annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]
+
+ # --------------------------------------
+ # 2D box -- project 8 corners of 3D bbox
+ # --------------------------------------
+ corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
+ l, t = corners[:, 0].min(), corners[:, 1].min()
+ r, b = corners[:, 0].max(), corners[:, 1].max()
+
+ x1 = max(0, l)
+ y1 = max(0, t)
+ x2 = min(image_shape[1], r)
+ y2 = min(image_shape[0], b)
+
+ iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2])
+ if iou < self.min_box_visibility:
+ continue
+
+ annotation['bbox'] = [x1, y1, x2, y2]
+ annotation['bbox_mode'] = BoxMode.XYXY_ABS
+
+ # --------
+ # Track ID
+ # --------
+ annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]
+
+ # ---------
+ # Attribute
+ # ---------
+ attr_tokens = ann['attribute_tokens']
+            assert len(attr_tokens) < 2  # NOTE: Allow only a single attribute.
+ attribute_id = MAX_NUM_ATTRIBUTES # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
+ if attr_tokens:
+ attribute = self.nusc.get('attribute', attr_tokens[0])['name']
+ attribute_id = ATTRIBUTE_IDS[attribute]
+ annotation['attribute_id'] = attribute_id
+
+ # -----
+ # Speed
+ # -----
+ vel_global = self.nusc.box_velocity(ann['token'])
+ speed = np.linalg.norm(vel_global) # NOTE: This can be NaN.
+ # DEBUG:
+ # speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global
+ annotation['speed'] = speed
+
+ annotations.append(annotation)
+
+ return annotations
+
+ def _get_ego_velocity(self, current, max_time_diff=1.5):
+ """Velocity of ego-vehicle in m/s.
+ """
+ has_prev = current['prev'] != ''
+ has_next = current['next'] != ''
+
+ # Cannot estimate velocity for a single annotation.
+ if not has_prev and not has_next:
+ return np.array([np.nan, np.nan, np.nan])
+
+ if has_prev:
+ first = self.nusc.get('sample_data', current['prev'])
+ else:
+ first = current
+
+ if has_next:
+ last = self.nusc.get('sample_data', current['next'])
+ else:
+ last = current
+
+ pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
+ pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
+ pos_diff = np.float32(pos_last) - np.float32(pos_first)
+
+ time_last = 1e-6 * last['timestamp']
+ time_first = 1e-6 * first['timestamp']
+ time_diff = time_last - time_first
+
+ if has_next and has_prev:
+ # If doing centered difference, allow for up to double the max_time_diff.
+ max_time_diff *= 2
+
+ if time_diff > max_time_diff:
+ # If time_diff is too big, don't return an estimate.
+ return np.array([np.nan, np.nan, np.nan])
+ else:
+ return pos_diff / time_diff
+
+ def __getitem__(self, idx):
+ datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
+ datum = self.nusc.get('sample_data', datum_token)
+ assert datum['is_key_frame']
+
+ filename, _annotations, K = self.nusc.get_sample_data(datum_token)
+ image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
+ height, width = datum['height'], datum['width']
+ d2_dict = OrderedDict(
+ file_name=filename,
+ height=height,
+ width=width,
+ image_id=image_id,
+ sample_id=sample_id,
+ sample_token=sample_token
+ )
+
+ # Intrinsics
+ d2_dict['intrinsics'] = list(K.flatten())
+
+ # Get pose of the sensor (S) from vehicle (V) frame
+ _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
+ pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
+
+ # Get ego-pose of the vehicle (V) from global/world (W) frame
+ _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
+ pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
+ pose_WS = pose_WV * pose_VS
+
+ d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
+ d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
+
+ d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
+
+ d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
+
+ return d2_dict
+
+ def getitem_by_datumtoken(self, datum_token):
+ # idx = self.datumtoken2idx[datum_token]
+ # ret = self.__getitem__(idx)
+
+ datum = self.nusc.get('sample_data', datum_token)
+ sample_token = datum['sample_token']
+ filename, _annotations, K = self.nusc.get_sample_data(datum_token)
+ height, width = datum['height'], datum['width']
+ d2_dict = OrderedDict(
+ file_name=filename,
+ height=height,
+ width=width,
+ image_id=0,
+ sample_id=0,
+ sample_token=sample_token
+ )
+ # Intrinsics
+ d2_dict['intrinsics'] = list(K.flatten())
+ # Get pose of the sensor (S) from vehicle (V) frame
+ _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
+ pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
+ # Get ego-pose of the vehicle (V) from global/world (W) frame
+ _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
+ pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
+ pose_WS = pose_WV * pose_VS
+
+ d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
+ d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
+
+ d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
+
+ d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
+ return d2_dict
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/transform_utils.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/transform_utils.py
new file mode 100644
index 0000000..623bd6e
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/transform_utils.py
@@ -0,0 +1,136 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# Adapted from detectron2:
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py
+import numpy as np
+import torch
+
+from detectron2.data import transforms as T
+from detectron2.structures import Boxes, BoxMode, Instances
+
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+
+__all__ = ["transform_instance_annotations", "annotations_to_instances"]
+
+
+def transform_instance_annotations(
+ annotation,
+ transforms,
+ image_size,
+):
+ """Adapted from:
+ https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254
+
+ The changes from original:
+ - The presence of 2D bounding box (i.e. "bbox" field) is assumed by default in d2; here it's optional.
+ - Add optional 3D bounding box support.
+ - If the instance mask annotation is in RLE, then it's decoded into polygons, not bitmask, to save memory.
+
+ ===============================================================================================================
+
+ Apply transforms to box, segmentation and keypoints annotations of a single instance.
+
+ It will use `transforms.apply_box` for the box, and
+ `transforms.apply_coords` for segmentation polygons & keypoints.
+ If you need anything more specially designed for each data structure,
+ you'll need to implement your own version of this function or the transforms.
+
+ Args:
+ annotation (dict): dict of instance annotations for a single instance.
+ It will be modified in-place.
+ transforms (TransformList or list[Transform]):
+ image_size (tuple): the height, width of the transformed image
+ keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+
+ Returns:
+ dict:
+ the same input dict with fields "bbox", "segmentation", "keypoints"
+ transformed according to `transforms`.
+ The "bbox_mode" field will be set to XYXY_ABS.
+ """
+ if isinstance(transforms, (tuple, list)):
+ transforms = T.TransformList(transforms)
+ # (dennis.park) Here 2D bounding box is optional.
+ if "bbox" in annotation:
+ assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not."
+ # bbox is 1d (per-instance bounding box)
+ bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+ bbox = transforms.apply_box(np.array([bbox]))[0]
+ # clip transformed bbox to image size
+ bbox = bbox.clip(min=0)
+ bbox = np.minimum(bbox, list(image_size + image_size)[::-1])
+ annotation["bbox"] = bbox
+ annotation["bbox_mode"] = BoxMode.XYXY_ABS
+
+ # Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed.
+ if "bbox3d" in annotation:
+ bbox3d = np.array(annotation["bbox3d"])
+ annotation['bbox3d'] = transforms.apply_box3d(bbox3d)
+
+ return annotation
+
+
+def _create_empty_instances(image_size):
+ target = Instances(image_size)
+
+ target.gt_boxes = Boxes([])
+ target.gt_classes = torch.tensor([], dtype=torch.int64)
+ target.gt_boxes3d = Boxes3D.from_vectors([], torch.eye(3, dtype=torch.float32))
+
+ return target
+
+
+def annotations_to_instances(
+ annos,
+ image_size,
+ intrinsics=None,
+):
+ """
+ Create an :class:`Instances` object used by the models,
+ from instance annotations in the dataset dict.
+
+ Args:
+ annos (list[dict]): a list of instance annotations in one image, each
+ element for one instance.
+ image_size (tuple): height, width
+
+ Returns:
+ Instances:
+ It will contain fields "gt_boxes", "gt_classes",
+ "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
+ This is the format that builtin models expect.
+ """
+ if len(annos) == 0:
+ return _create_empty_instances(image_size)
+
+ boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
+ target = Instances(image_size)
+ target.gt_boxes = Boxes(boxes)
+
+ classes = [obj["category_id"] for obj in annos]
+ classes = torch.tensor(classes, dtype=torch.int64)
+ target.gt_classes = classes
+
+ if len(annos) and "bbox3d" in annos[0]:
+ assert intrinsics is not None
+ target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics)
+ if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]:
+ raise ValueError(
+ f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}."
+ )
+
+ # NOTE: add nuscenes attributes here
+ # NOTE: instances will be filtered later
+ # NuScenes attributes
+ if len(annos) and "attribute_id" in annos[0]:
+ attributes = [obj["attribute_id"] for obj in annos]
+ target.gt_attributes = torch.tensor(attributes, dtype=torch.int64)
+
+ # Speed (magnitude of velocity)
+ if len(annos) and "speed" in annos[0]:
+ speeds = [obj["speed"] for obj in annos]
+ target.gt_speeds = torch.tensor(speeds, dtype=torch.float32)
+
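+    # NOTE: `attributes` and `speeds` are only bound when the corresponding
+    # keys are present, so this check assumes NuScenes-style annotations that
+    # always carry "attribute_id" and "speed".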
+ assert len(boxes) == len(classes) == len(attributes) == len(speeds), \
+ 'the numbers of annotations should be the same'
+ return target
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/iou_loss.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/iou_loss.py
new file mode 100644
index 0000000..97638ef
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/iou_loss.py
@@ -0,0 +1,71 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# Adapted from AdelaiDet:
+# https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py
+import torch
+from torch import nn
+
+
+class IOULoss(nn.Module):
+ """
+    Intersection Over Union (IoU) loss which supports three
+ different IoU computations:
+
+ * IoU
+ * Linear IoU
+ * gIoU
+ """
+ def __init__(self, loc_loss_type='iou'):
+ super(IOULoss, self).__init__()
+ self.loc_loss_type = loc_loss_type
+
+ def forward(self, pred, target, weight=None):
+ """
+ Args:
+ pred: Nx4 predicted bounding boxes
+ target: Nx4 target bounding boxes
+ weight: N loss weight for each instance
+ """
+ pred_left = pred[:, 0]
+ pred_top = pred[:, 1]
+ pred_right = pred[:, 2]
+ pred_bottom = pred[:, 3]
+
+ target_left = target[:, 0]
+ target_top = target[:, 1]
+ target_right = target[:, 2]
+ target_bottom = target[:, 3]
+
+        target_area = (target_left + target_right) * \
+            (target_top + target_bottom)
+        pred_area = (pred_left + pred_right) * \
+            (pred_top + pred_bottom)
+
+ w_intersect = torch.min(pred_left, target_left) + \
+ torch.min(pred_right, target_right)
+ h_intersect = torch.min(pred_bottom, target_bottom) + \
+ torch.min(pred_top, target_top)
+
+ g_w_intersect = torch.max(pred_left, target_left) + \
+ torch.max(pred_right, target_right)
+ g_h_intersect = torch.max(pred_bottom, target_bottom) + \
+ torch.max(pred_top, target_top)
+        ac_union = g_w_intersect * g_h_intersect
+
+        area_intersect = w_intersect * h_intersect
+        area_union = target_area + pred_area - area_intersect
+
+        ious = (area_intersect + 1.0) / (area_union + 1.0)
+        gious = ious - (ac_union - area_union) / ac_union
+ if self.loc_loss_type == 'iou':
+ losses = -torch.log(ious)
+ elif self.loc_loss_type == 'linear_iou':
+ losses = 1 - ious
+ elif self.loc_loss_type == 'giou':
+ losses = 1 - gious
+ else:
+ raise NotImplementedError
+
+ if weight is not None:
+ return (losses * weight).sum()
+ else:
+ return losses.sum()
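+
+
+# Usage sketch (hypothetical tensors; boxes are encoded as (left, top, right,
+# bottom) distances from each location, as in FCOS):
+#   criterion = IOULoss(loc_loss_type='giou')
+#   loss = criterion(pred_ltrb, target_ltrb, weight=centerness_targets)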
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/normalization.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/normalization.py
new file mode 100644
index 0000000..bed7c63
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/normalization.py
@@ -0,0 +1,40 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# Adapted from AdelaiDet
+# https://github.com/aim-uofa/AdelaiDet/
+import logging
+
+import torch
+from torch import nn
+
+LOG = logging.getLogger(__name__)
+
+
+class Scale(nn.Module):
+ def __init__(self, init_value=1.0):
+ super(Scale, self).__init__()
+ self.scale = nn.Parameter(torch.FloatTensor([init_value]))
+
+ def forward(self, input):
+ return input * self.scale
+
+
+class Offset(nn.Module):
+ def __init__(self, init_value=0.):
+ super(Offset, self).__init__()
+ self.bias = nn.Parameter(torch.FloatTensor([init_value]))
+
+ def forward(self, input):
+ return input + self.bias
+
+
+class ModuleListDial(nn.ModuleList):
+ def __init__(self, modules=None):
+ super(ModuleListDial, self).__init__(modules)
+ self.cur_position = 0
+
+ def forward(self, x):
+ result = self[self.cur_position](x)
+ self.cur_position += 1
+ if self.cur_position >= len(self):
+ self.cur_position = 0
+ return result
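+
+
+# ModuleListDial cycles through its sub-modules on successive calls: a typical
+# use is one BatchNorm per FPN level, where a shared head calls the dial once
+# per level so each level gets its own normalization statistics.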
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py
new file mode 100644
index 0000000..b5448d0
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# Adapted from fvcore:
+# https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py
+
+import torch
+
+
+def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none") -> torch.Tensor:
+ """
+ Smooth L1 loss defined in the Fast R-CNN paper as:
+
+ | 0.5 * x ** 2 / beta if abs(x) < beta
+ smoothl1(x) = |
+ | abs(x) - 0.5 * beta otherwise,
+
+ where x = input - target.
+
+ Smooth L1 loss is related to Huber loss, which is defined as:
+
+ | 0.5 * x ** 2 if abs(x) < beta
+ huber(x) = |
+ | beta * (abs(x) - 0.5 * beta) otherwise
+
+ Smooth L1 loss is equal to huber(x) / beta. This leads to the following
+ differences:
+
+ - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss
+ converges to a constant 0 loss.
+ - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss
+ converges to L2 loss.
+ - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant
+ slope of 1. For Huber loss, the slope of the L1 segment is beta.
+
+ Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta
+ portion replaced with a quadratic function such that at abs(x) = beta, its
+ slope is 1. The quadratic segment smooths the L1 loss near x = 0.
+
+ Args:
+ input (Tensor): input tensor of any shape
+ target (Tensor): target value tensor with the same shape as input
+ beta (float): L1 to L2 change point.
+ For beta values < 1e-5, L1 loss is computed.
+ reduction: 'none' | 'mean' | 'sum'
+ 'none': No reduction will be applied to the output.
+ 'mean': The output will be averaged.
+ 'sum': The output will be summed.
+
+ Returns:
+ The loss with the reduction option applied.
+
+ Note:
+ PyTorch's builtin "Smooth L1 loss" implementation does not actually
+ implement Smooth L1 loss, nor does it implement Huber loss. It implements
+ the special case of both in which they are equal (beta=1).
+ See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss.
+ """
+ # (dennis.park) Make it work with mixed precision training.
+ beta = torch.as_tensor(beta).to(input.dtype)
+ if beta < 1e-5:
+ # if beta == 0, then torch.where will result in nan gradients when
+ # the chain rule is applied due to pytorch implementation details
+ # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of
+ # zeros, rather than "no gradient"). To avoid this issue, we define
+ # small values of beta to be exactly l1 loss.
+ loss = torch.abs(input - target)
+ else:
+ n = torch.abs(input - target)
+ cond = n < beta
+ a = 0.5 * n**2
+ b = n - 0.5 * beta
+ a, b = a.to(input.dtype), b.to(input.dtype)
+ loss = torch.where(cond, a, b)
+ # loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
+
+ if reduction == "mean":
+ loss = loss.mean()
+ elif reduction == "sum":
+ loss = loss.sum()
+ return loss
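+
+
+# Usage sketch (values shown are what the formula above gives for beta = 1):
+#   pred = torch.tensor([0.0, 2.0]); target = torch.tensor([0.5, 0.0])
+#   smooth_l1_loss(pred, target, beta=1.0)                     # tensor([0.1250, 1.5000])
+#   smooth_l1_loss(pred, target, beta=1.0, reduction="mean")   # tensor(0.8125)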
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/__init__.py
new file mode 100644
index 0000000..dd76a61
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/__init__.py
@@ -0,0 +1 @@
+from .nuscenes_dd3d import NuscenesDD3D
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/core.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/core.py
new file mode 100644
index 0000000..4830248
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/core.py
@@ -0,0 +1,217 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import torch
+from torch import nn
+
+#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+from mmcv.modeling.postprocessing import detector_postprocess as resize_instances
+from mmcv.structures import Instances
+from mmcv.layers import ShapeSpec
+from mmcv.utils import force_fp32
+
+from .fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss
+from .fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss
+#from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate
+from .prepare_targets import DD3DTargetPreparer
+#from tridet.modeling.feature_extractor import build_feature_extractor
+from ..structures.image_list import ImageList
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.tensor2d import compute_features_locations as compute_locations_per_level
+
+
+#@META_ARCH_REGISTRY.register()
+class DD3D(nn.Module):
+ def __init__(self,
+ num_classes,
+ in_channels,
+ strides,
+ fcos2d_cfg=dict(),
+ fcos2d_loss_cfg=dict(),
+ fcos3d_cfg=dict(),
+ fcos3d_loss_cfg=dict(),
+ target_assign_cfg=dict(),
+ box3d_on=True,
+ feature_locations_offset="none"):
+ super().__init__()
+ # NOTE: do not need backbone
+ # self.backbone = build_feature_extractor(cfg)
+ # backbone_output_shape = self.backbone.output_shape()
+ # self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys())
+
+ self.backbone_output_shape = [ShapeSpec(channels=in_channels, stride=s) for s in strides]
+
+ self.feature_locations_offset = feature_locations_offset
+
+ self.fcos2d_head = FCOS2DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
+ **fcos2d_cfg)
+ self.fcos2d_loss = FCOS2DLoss(num_classes=num_classes, **fcos2d_loss_cfg)
+ # NOTE: inference later
+ # self.fcos2d_inference = FCOS2DInference(cfg)
+
+ if box3d_on:
+ self.fcos3d_head = FCOS3DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
+ **fcos3d_cfg)
+ self.fcos3d_loss = FCOS3DLoss(num_classes=num_classes, **fcos3d_loss_cfg)
+ # NOTE: inference later
+ # self.fcos3d_inference = FCOS3DInference(cfg)
+ self.only_box2d = False
+ else:
+ self.only_box2d = True
+
+ self.prepare_targets = DD3DTargetPreparer(num_classes=num_classes,
+ input_shape=self.backbone_output_shape,
+ box3d_on=box3d_on,
+ **target_assign_cfg)
+
+ # NOTE: inference later
+ # self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS
+
+ # self.do_nms = cfg.DD3D.INFERENCE.DO_NMS
+ # self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS
+ # self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH
+
+ # nuScenes inference aggregates detections over all 6 cameras.
+ # self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE
+ self.num_classes = num_classes
+
+ # NOTE: do not need normalize
+ # self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+ # self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+ # NOTE:
+ # @property
+ # def device(self):
+ # return self.pixel_mean.device
+
+ # def preprocess_image(self, x):
+ # return (x - self.pixel_mean) / self.pixel_std
+
+    @force_fp32(apply_to=('features',))
+ def forward(self, features, batched_inputs):
+ # NOTE:
+ # images = [x["image"].to(self.device) for x in batched_inputs]
+ # images = [self.preprocess_image(x) for x in images]
+
+ # NOTE: directly use inv_intrinsics
+ # if 'intrinsics' in batched_inputs[0]:
+ # intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
+ # else:
+ # intrinsics = None
+ # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
+ if 'inv_intrinsics' in batched_inputs[0]:
+ inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
+ inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
+ else:
+ inv_intrinsics = None
+
+ # NOTE:
+ # gt_dense_depth = None
+ # if 'depth' in batched_inputs[0]:
+ # gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
+ # gt_dense_depth = ImageList.from_tensors(
+ # gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
+ # )
+
+ # NOTE: directly input feature
+ # features = self.backbone(images.tensor)
+ # features = [features[f] for f in self.in_features]
+
+ if "instances" in batched_inputs[0]:
+ gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
+ else:
+ gt_instances = None
+
+ locations = self.compute_locations(features)
+ logits, box2d_reg, centerness, _ = self.fcos2d_head(features)
+ if not self.only_box2d:
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
+ # NOTE: directly use inv_intrinsics
+ # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
+
+ if self.training:
+ assert gt_instances is not None
+ feature_shapes = [x.shape[-2:] for x in features]
+ training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
+ # NOTE:
+ # if gt_dense_depth is not None:
+ # training_targets.update({"dense_depth": gt_dense_depth})
+
+ losses = {}
+ fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
+ losses.update(fcos2d_loss)
+
+ if not self.only_box2d:
+ fcos3d_loss = self.fcos3d_loss(
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
+ fcos2d_info, training_targets
+ )
+ losses.update(fcos3d_loss)
+ return losses
+ else:
+ # TODO: do not support inference now
+ raise NotImplementedError
+
+ pred_instances, fcos2d_info = self.fcos2d_inference(
+ logits, box2d_reg, centerness, locations, images.image_sizes
+ )
+ if not self.only_box2d:
+ # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place.
+ self.fcos3d_inference(
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
+ fcos2d_info
+ )
+
+ # 3D score == 2D score x confidence.
+ score_key = "scores_3d"
+ else:
+ score_key = "scores"
+
+ # Transpose to "image-first", i.e. (B, L)
+ pred_instances = list(zip(*pred_instances))
+ pred_instances = [Instances.cat(instances) for instances in pred_instances]
+
+ # 2D NMS and pick top-K.
+ if self.do_nms:
+ pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
+
+ if not self.only_box2d and self.do_bev_nms:
+ # Bird-eye-view NMS.
+ dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
+ if 'pose' in batched_inputs[0]:
+ poses = [x['pose'] for x in batched_inputs]
+ else:
+ poses = [x['extrinsics'] for x in batched_inputs]
+ pred_instances = nuscenes_sample_aggregate(
+ pred_instances,
+ dummy_group_idxs,
+ self.num_classes,
+ poses,
+ iou_threshold=self.bev_nms_iou_thresh,
+ include_boxes3d_global=False
+ )
+
+ if self.postprocess_in_inference:
+ processed_results = []
+ for results_per_image, input_per_image, image_size in \
+ zip(pred_instances, batched_inputs, images.image_sizes):
+ height = input_per_image.get("height", image_size[0])
+ width = input_per_image.get("width", image_size[1])
+ r = resize_instances(results_per_image, height, width)
+ processed_results.append({"instances": r})
+ else:
+ processed_results = [{"instances": x} for x in pred_instances]
+
+ return processed_results
+
+ def compute_locations(self, features):
+ locations = []
+ in_strides = [x.stride for x in self.backbone_output_shape]
+ for level, feature in enumerate(features):
+ h, w = feature.size()[-2:]
+ locations_per_level = compute_locations_per_level(
+ h, w, in_strides[level], feature.dtype, feature.device, offset=self.feature_locations_offset
+ )
+ locations.append(locations_per_level)
+ return locations
+
+ def forward_train(self, features, batched_inputs):
+ self.train()
+ return self.forward(features, batched_inputs)
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py
new file mode 100644
index 0000000..5cdaf0f
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py
@@ -0,0 +1,46 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import logging
+
+import torch
+import torch.nn as nn
+
+from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.smooth_l1_loss import smooth_l1_loss
+
+LOG = logging.getLogger(__name__)
+
+
+class DisentangledBox3DLoss(nn.Module):
+ def __init__(self, smooth_l1_loss_beta, max_loss_per_group):
+ super().__init__()
+ self.smooth_l1_loss_beta = smooth_l1_loss_beta
+ self.max_loss_per_group = max_loss_per_group
+
+ def forward(self, box3d_pred, box3d_targets, locations, weights=None):
+
+ box3d_pred = box3d_pred.to(torch.float32)
+ box3d_targets = box3d_targets.to(torch.float32)
+
+ target_corners = box3d_targets.corners
+
+ disentangled_losses = {}
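+        # Disentangled corner loss: for each component group, substitute only
+        # that predicted component into a copy of the ground-truth box, decode
+        # the 8 corners, and compare them to the ground-truth corners. Each
+        # group's loss thus isolates the error contributed by that component.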
+ for component_key in ["quat", "proj_ctr", "depth", "size"]:
+ disentangled_boxes = box3d_targets.clone()
+ setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key))
+ pred_corners = disentangled_boxes.to(torch.float32).corners
+
+ loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta)
+
+            # Bound the loss (clamp returns a new tensor, so keep the result)
+            loss = loss.clamp(max=self.max_loss_per_group)
+
+ if weights is not None:
+ # loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1))
+ loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights)
+ else:
+ loss = loss.reshape(-1, 24).mean()
+
+ disentangled_losses["loss_box3d_" + component_key] = loss
+
+ entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1)
+
+ return disentangled_losses, entangled_l1_dist
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos2d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos2d.py
new file mode 100644
index 0000000..c9c6c08
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos2d.py
@@ -0,0 +1,388 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# Adapted from AdelaiDet:
+# https://github.com/aim-uofa/AdelaiDet
+import torch
+from mmcv.losses import sigmoid_focal_loss
+from torch import nn
+from torch.nn import functional as F
+
+from mmcv.layers import Conv2d, batched_nms, cat, get_norm
+from mmcv.structures import Instances, Boxes
+from torch import distributed as dist
+from mmcv.utils import force_fp32
+
+from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.iou_loss import IOULoss
+from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Scale
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
+
+INF = 100000000
+
+def get_world_size() -> int:
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size()
+
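+# FCOS centerness target: for regression targets (l, t, r, b) of a foreground
+# location, centerness = sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b)).
+# It is 1 at the box center, decays to 0 at the border, and is used both as the
+# centerness-branch target and to weight the box regression loss.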
+def compute_ctrness_targets(reg_targets):
+ if len(reg_targets) == 0:
+ return reg_targets.new_zeros(len(reg_targets))
+ left_right = reg_targets[:, [0, 2]]
+ top_bottom = reg_targets[:, [1, 3]]
+ ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
+ (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
+ return torch.sqrt(ctrness)
+
+class FCOS2DHead(nn.Module):
+ def __init__(self,
+ num_classes,
+ input_shape,
+ num_cls_convs=4,
+ num_box_convs=4,
+ norm='BN',
+ use_deformable=False,
+ use_scale=True,
+ box2d_scale_init_factor=1.0,
+ version='v2'):
+ super().__init__()
+
+ self.num_classes = num_classes
+ self.in_strides = [shape.stride for shape in input_shape]
+ self.num_levels = len(input_shape)
+
+ self.use_scale = use_scale
+ self.box2d_scale_init_factor = box2d_scale_init_factor
+
+ self._version = version
+
+ in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same number of channels!"
+ in_channels = in_channels[0]
+
+ if use_deformable:
+ raise ValueError("Not supported yet.")
+
+ head_configs = {'cls': num_cls_convs, 'box2d': num_box_convs}
+
+ for head_name, num_convs in head_configs.items():
+ tower = []
+ if self._version == "v1":
+ for _ in range(num_convs):
+ conv_func = nn.Conv2d
+ tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
+ if norm == "GN":
+ raise NotImplementedError()
+ elif norm == "NaiveGN":
+ raise NotImplementedError()
+ elif norm == "BN":
+ tower.append(ModuleListDial([nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)]))
+ elif norm == "SyncBN":
+ raise NotImplementedError()
+ tower.append(nn.ReLU())
+ elif self._version == "v2":
+ for _ in range(num_convs):
+ if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
+ # NOTE: need to add norm here!
+ # Each FPN level has its own batchnorm layer.
+ # NOTE: do not use dd3d train.py!
+ # "BN" is converted to "SyncBN" in distributed training (see train.py)
+ norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
+ else:
+ norm_layer = get_norm(norm, in_channels)
+ tower.append(
+ Conv2d(
+ in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=norm_layer is None,
+ norm=norm_layer,
+ activation=F.relu
+ )
+ )
+ else:
+ raise ValueError(f"Invalid FCOS2D version: {self._version}")
+ self.add_module(f'{head_name}_tower', nn.Sequential(*tower))
+
+ self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
+ self.box2d_reg = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1)
+ self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1)
+
+ if self.use_scale:
+ if self._version == "v1":
+ self.scales_reg = nn.ModuleList([
+ Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
+ ])
+ else:
+ self.scales_box2d_reg = nn.ModuleList([
+ Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
+ ])
+
+ self.init_weights()
+
+ def init_weights(self):
+
+ for tower in [self.cls_tower, self.box2d_tower]:
+ for l in tower.modules():
+ if isinstance(l, nn.Conv2d):
+ torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
+ if l.bias is not None:
+ torch.nn.init.constant_(l.bias, 0)
+
+ predictors = [self.cls_logits, self.box2d_reg, self.centerness]
+
+ for modules in predictors:
+ for l in modules.modules():
+ if isinstance(l, nn.Conv2d):
+ torch.nn.init.kaiming_uniform_(l.weight, a=1)
+ if l.bias is not None: # depth head may not have bias.
+ torch.nn.init.constant_(l.bias, 0)
+
+ def forward(self, x):
+ logits = []
+ box2d_reg = []
+ centerness = []
+
+ extra_output = {"cls_tower_out": []}
+
+ for l, feature in enumerate(x):
+ cls_tower_out = self.cls_tower(feature)
+ bbox_tower_out = self.box2d_tower(feature)
+
+ # 2D box
+ logits.append(self.cls_logits(cls_tower_out))
+ centerness.append(self.centerness(bbox_tower_out))
+ box_reg = self.box2d_reg(bbox_tower_out)
+ if self.use_scale:
+ # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
+ if self._version == "v1":
+ box_reg = self.scales_reg[l](box_reg)
+ else:
+ box_reg = self.scales_box2d_reg[l](box_reg)
+ # Note that we use relu, as in the improved FCOS, instead of exp.
+ box2d_reg.append(F.relu(box_reg))
+
+ extra_output['cls_tower_out'].append(cls_tower_out)
+
+ return logits, box2d_reg, centerness, extra_output
+
+
+class FCOS2DLoss(nn.Module):
+ def __init__(self,
+ num_classes,
+ focal_loss_alpha=0.25,
+ focal_loss_gamma=2.0,
+ loc_loss_type='giou',
+ ):
+ super().__init__()
+ self.focal_loss_alpha = focal_loss_alpha
+ self.focal_loss_gamma = focal_loss_gamma
+
+ self.box2d_reg_loss_fn = IOULoss(loc_loss_type)
+
+ self.num_classes = num_classes
+
+ @force_fp32(apply_to=('logits', 'box2d_reg', 'centerness'))
+ def forward(self, logits, box2d_reg, centerness, targets):
+ labels = targets['labels']
+ box2d_reg_targets = targets['box2d_reg_targets']
+ pos_inds = targets["pos_inds"]
+
+ if len(labels) != box2d_reg_targets.shape[0]:
+ raise ValueError(
+ f"The size of 'labels' and 'box2d_reg_targets' does not match: a={len(labels)}, b={box2d_reg_targets.shape[0]}"
+ )
+
+ # Flatten predictions
+ logits = cat([x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits])
+ box2d_reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in box2d_reg])
+ centerness_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in centerness])
+
+ # -------------------
+ # Classification loss
+ # -------------------
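+        # The focal loss is normalized by the number of foreground locations
+        # averaged over all GPUs (clamped to at least 1), so its scale does not
+        # depend on how positives are sharded across ranks.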
+ num_pos_local = pos_inds.numel()
+ num_gpus = get_world_size()
+ total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
+ num_pos_avg = max(total_num_pos / num_gpus, 1.0)
+
+ # prepare one_hot
+ cls_target = torch.zeros_like(logits)
+ cls_target[pos_inds, labels[pos_inds]] = 1
+
+ loss_cls = sigmoid_focal_loss(
+ logits,
+ cls_target,
+ alpha=self.focal_loss_alpha,
+ gamma=self.focal_loss_gamma,
+ reduction="sum",
+ ) / num_pos_avg
+
+        # NOTE: The rest of the losses only consider foreground pixels.
+ box2d_reg_pred = box2d_reg_pred[pos_inds]
+ box2d_reg_targets = box2d_reg_targets[pos_inds]
+
+ centerness_pred = centerness_pred[pos_inds]
+
+ # Compute centerness targets here using 2D regression targets of foreground pixels.
+ centerness_targets = compute_ctrness_targets(box2d_reg_targets)
+
+ # Denominator for all foreground losses.
+ ctrness_targets_sum = centerness_targets.sum()
+ loss_denom = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
+
+        # NOTE: this early return is placed after the reduce_sum calls so that all ranks stay in sync.
+ if pos_inds.numel() == 0:
+ losses = {
+ "loss_cls": loss_cls,
+ "loss_box2d_reg": box2d_reg_pred.sum() * 0.,
+ "loss_centerness": centerness_pred.sum() * 0.,
+ }
+ return losses, {}
+
+ # ----------------------
+ # 2D box regression loss
+ # ----------------------
+ loss_box2d_reg = self.box2d_reg_loss_fn(box2d_reg_pred, box2d_reg_targets, centerness_targets) / loss_denom
+
+ # ---------------
+ # Centerness loss
+ # ---------------
+ loss_centerness = F.binary_cross_entropy_with_logits(
+ centerness_pred, centerness_targets, reduction="sum"
+ ) / num_pos_avg
+
+ loss_dict = {"loss_cls": loss_cls, "loss_box2d_reg": loss_box2d_reg, "loss_centerness": loss_centerness}
+ extra_info = {"loss_denom": loss_denom, "centerness_targets": centerness_targets}
+
+ return loss_dict, extra_info
+
+
+class FCOS2DInference():
+ def __init__(self, cfg):
+ self.thresh_with_ctr = cfg.DD3D.FCOS2D.INFERENCE.THRESH_WITH_CTR
+ self.pre_nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_THRESH
+ self.pre_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_TOPK
+ self.post_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.POST_NMS_TOPK
+ self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH
+ self.num_classes = cfg.DD3D.NUM_CLASSES
+
+ def __call__(self, logits, box2d_reg, centerness, locations, image_sizes):
+
+ pred_instances = [] # List[List[Instances]], shape = (L, B)
+ extra_info = []
+ for lvl, (logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl) in \
+ enumerate(zip(logits, box2d_reg, centerness, locations)):
+
+ instances_per_lvl, extra_info_per_lvl = self.forward_for_single_feature_map(
+ logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl, image_sizes
+ ) # List of Instances; one for each image.
+
+ for instances_per_im in instances_per_lvl:
+ instances_per_im.fpn_levels = locations_lvl.new_ones(len(instances_per_im), dtype=torch.long) * lvl
+
+ pred_instances.append(instances_per_lvl)
+ extra_info.append(extra_info_per_lvl)
+
+ return pred_instances, extra_info
+
+ def forward_for_single_feature_map(self, logits, box2d_reg, centerness, locations, image_sizes):
+ N, C, _, __ = logits.shape
+
+ # put in the same format as locations
+ scores = logits.permute(0, 2, 3, 1).reshape(N, -1, C).sigmoid()
+ box2d_reg = box2d_reg.permute(0, 2, 3, 1).reshape(N, -1, 4)
+ centerness = centerness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid()
+
+ # if self.thresh_with_ctr is True, we multiply the classification
+ # scores with centerness scores before applying the threshold.
+ if self.thresh_with_ctr:
+ scores = scores * centerness[:, :, None]
+
+ candidate_mask = scores > self.pre_nms_thresh
+
+ pre_nms_topk = candidate_mask.reshape(N, -1).sum(1)
+ pre_nms_topk = pre_nms_topk.clamp(max=self.pre_nms_topk)
+
+ if not self.thresh_with_ctr:
+ scores = scores * centerness[:, :, None]
+
+ results = []
+ all_fg_inds_per_im, all_topk_indices, all_class_inds_per_im = [], [], []
+ for i in range(N):
+ scores_per_im = scores[i]
+ candidate_mask_per_im = candidate_mask[i]
+ scores_per_im = scores_per_im[candidate_mask_per_im]
+
+ candidate_inds_per_im = candidate_mask_per_im.nonzero(as_tuple=False)
+ fg_inds_per_im = candidate_inds_per_im[:, 0]
+ class_inds_per_im = candidate_inds_per_im[:, 1]
+
+ # Cache info here.
+ all_fg_inds_per_im.append(fg_inds_per_im)
+ all_class_inds_per_im.append(class_inds_per_im)
+
+ box2d_reg_per_im = box2d_reg[i][fg_inds_per_im]
+ locations_per_im = locations[fg_inds_per_im]
+
+ pre_nms_topk_per_im = pre_nms_topk[i]
+
+ if candidate_mask_per_im.sum().item() > pre_nms_topk_per_im.item():
+ scores_per_im, topk_indices = \
+ scores_per_im.topk(pre_nms_topk_per_im, sorted=False)
+
+ class_inds_per_im = class_inds_per_im[topk_indices]
+ box2d_reg_per_im = box2d_reg_per_im[topk_indices]
+ locations_per_im = locations_per_im[topk_indices]
+ else:
+ topk_indices = None
+
+ all_topk_indices.append(topk_indices)
+
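+            # Decode FCOS distance predictions back into a box:
+            # (x1, y1, x2, y2) = (cx - l, cy - t, cx + r, cy + b), where
+            # (cx, cy) is the feature location and (l, t, r, b) are the
+            # predicted per-side distances.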
+ detections = torch.stack([
+ locations_per_im[:, 0] - box2d_reg_per_im[:, 0],
+ locations_per_im[:, 1] - box2d_reg_per_im[:, 1],
+ locations_per_im[:, 0] + box2d_reg_per_im[:, 2],
+ locations_per_im[:, 1] + box2d_reg_per_im[:, 3],
+ ],
+ dim=1)
+
+ instances = Instances(image_sizes[i])
+ instances.pred_boxes = Boxes(detections)
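+            # Scores were multiplied by centerness above, so sqrt yields the
+            # geometric mean of classification score and centerness.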
+ instances.scores = torch.sqrt(scores_per_im)
+ instances.pred_classes = class_inds_per_im
+ instances.locations = locations_per_im
+
+ results.append(instances)
+
+ extra_info = {
+ "fg_inds_per_im": all_fg_inds_per_im,
+ "class_inds_per_im": all_class_inds_per_im,
+ "topk_indices": all_topk_indices
+ }
+ return results, extra_info
+
+ def nms_and_top_k(self, instances_per_im, score_key_for_nms="scores"):
+ results = []
+ for instances in instances_per_im:
+ if self.nms_thresh > 0:
+ # Multiclass NMS.
+ keep = batched_nms(
+ instances.pred_boxes.tensor, instances.get(score_key_for_nms), instances.pred_classes,
+ self.nms_thresh
+ )
+ instances = instances[keep]
+ num_detections = len(instances)
+
+ # Limit to max_per_image detections **over all classes**
+ if num_detections > self.post_nms_topk > 0:
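+                # Keep roughly the post_nms_topk best detections by
+                # thresholding at the k-th smallest score, with
+                # k = num_detections - post_nms_topk + 1 (ties are kept).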
+ scores = instances.scores
+ # image_thresh, _ = torch.kthvalue(scores.cpu(), num_detections - self.post_nms_topk + 1)
+ image_thresh, _ = torch.kthvalue(scores, num_detections - self.post_nms_topk + 1)
+ keep = scores >= image_thresh.item()
+ keep = torch.nonzero(keep).squeeze(1)
+ instances = instances[keep]
+ results.append(instances)
+ return results
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos3d.py
new file mode 100644
index 0000000..f0669a6
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos3d.py
@@ -0,0 +1,427 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from mmcv.layers import Conv2d, batched_nms, cat, get_norm
+from mmcv.utils import force_fp32
+
+from .disentangled_box3d_loss import DisentangledBox3DLoss
+from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Offset, Scale
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import allocentric_to_egocentric, unproject_points2d
+
+EPS = 1e-7
+
+
+def predictions_to_boxes3d(
+ quat,
+ proj_ctr,
+ depth,
+ size,
+ locations,
+ inv_intrinsics,
+ canon_box_sizes,
+ min_depth,
+ max_depth,
+ scale_depth_by_focal_lengths_factor,
+ scale_depth_by_focal_lengths=True,
+ quat_is_allocentric=True,
+ depth_is_distance=False
+):
+ # Normalize to make quat unit norm.
+ quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS)
+ # Make sure again it's numerically unit-norm.
+ quat = quat / quat.norm(dim=1, keepdim=True)
+
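+    # inv_intrinsics[:, 0, 0] and inv_intrinsics[:, 1, 1] are 1/fx and 1/fy, so
+    # pixel_size is proportional to 1/focal_length (assuming fx ~= fy).
+    # Dividing by it scales the raw depth prediction with the focal length
+    # (divided by the configured factor), making the depth head roughly
+    # invariant to focal length.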
+ if scale_depth_by_focal_lengths:
+ pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1)
+ depth = depth / (pixel_size * scale_depth_by_focal_lengths_factor)
+
+ if depth_is_distance:
+ depth = depth / unproject_points2d(locations, inv_intrinsics).norm(dim=1).clamp(min=EPS)
+
+ depth = depth.reshape(-1, 1).clamp(min_depth, max_depth)
+
+ proj_ctr = proj_ctr + locations
+
+ if quat_is_allocentric:
+ quat = allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics)
+
+ size = (size.tanh() + 1.) * canon_box_sizes # max size = 2 * canon_size
+
+ return Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics)
+
+
+class FCOS3DHead(nn.Module):
+ def __init__(self,
+ num_classes,
+ input_shape,
+ num_convs=4,
+ norm='BN',
+ use_scale=True,
+ depth_scale_init_factor=0.3,
+ proj_ctr_scale_init_factor=1.0,
+ use_per_level_predictors=False,
+ class_agnostic=False,
+ use_deformable=False,
+ mean_depth_per_level=None,
+ std_depth_per_level=None,
+ ):
+ super().__init__()
+ self.num_classes = num_classes
+ self.in_strides = [shape.stride for shape in input_shape]
+ self.num_levels = len(input_shape)
+
+ self.use_scale = use_scale
+ self.depth_scale_init_factor = depth_scale_init_factor
+ self.proj_ctr_scale_init_factor = proj_ctr_scale_init_factor
+ self.use_per_level_predictors = use_per_level_predictors
+
+ self.register_buffer("mean_depth_per_level", torch.Tensor(mean_depth_per_level))
+ self.register_buffer("std_depth_per_level", torch.Tensor(std_depth_per_level))
+
+ in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same number of channels!"
+ in_channels = in_channels[0]
+
+ if use_deformable:
+ raise ValueError("Not supported yet.")
+
+ box3d_tower = []
+ for i in range(num_convs):
+ if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
+ # NOTE: need to add norm here!
+ # Each FPN level has its own batchnorm layer.
+ # NOTE: do not use dd3d train.py!
+ # "BN" is converted to "SyncBN" in distributed training (see train.py)
+ norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
+ else:
+ norm_layer = get_norm(norm, in_channels)
+ box3d_tower.append(
+ Conv2d(
+ in_channels,
+ in_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=norm_layer is None,
+ norm=norm_layer,
+ activation=F.relu
+ )
+ )
+ self.add_module('box3d_tower', nn.Sequential(*box3d_tower))
+
+ num_classes = self.num_classes if not class_agnostic else 1
+ num_levels = self.num_levels if use_per_level_predictors else 1
+
+ # 3D box branches.
+ self.box3d_quat = nn.ModuleList([
+ Conv2d(in_channels, 4 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+ for _ in range(num_levels)
+ ])
+ self.box3d_ctr = nn.ModuleList([
+ Conv2d(in_channels, 2 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+ for _ in range(num_levels)
+ ])
+ self.box3d_depth = nn.ModuleList([
+ Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=(not self.use_scale))
+ for _ in range(num_levels)
+ ])
+ self.box3d_size = nn.ModuleList([
+ Conv2d(in_channels, 3 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+ for _ in range(num_levels)
+ ])
+ self.box3d_conf = nn.ModuleList([
+ Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+ for _ in range(num_levels)
+ ])
+
+ if self.use_scale:
+ self.scales_proj_ctr = nn.ModuleList([
+ Scale(init_value=stride * self.proj_ctr_scale_init_factor) for stride in self.in_strides
+ ])
+ # (pre-)compute (mean, std) of depth for each level, and determine the init value here.
+ self.scales_size = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
+ self.scales_conf = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
+
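+            # Depth is decoded in forward() as offsets_depth(scales_depth(x)),
+            # i.e. roughly mean + std * depth_scale_init_factor * raw_output at
+            # initialization, using the per-FPN-level depth statistics above.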
+ self.scales_depth = nn.ModuleList([
+ Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level
+ ])
+ self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level])
+
+ self._init_weights()
+
+ def _init_weights(self):
+
+ for l in self.box3d_tower.modules():
+ if isinstance(l, nn.Conv2d):
+ torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
+ if l.bias is not None:
+ torch.nn.init.constant_(l.bias, 0)
+
+ predictors = [self.box3d_quat, self.box3d_ctr, self.box3d_depth, self.box3d_size, self.box3d_conf]
+
+ for modules in predictors:
+ for l in modules.modules():
+ if isinstance(l, nn.Conv2d):
+ torch.nn.init.kaiming_uniform_(l.weight, a=1)
+ if l.bias is not None: # depth head may not have bias.
+ torch.nn.init.constant_(l.bias, 0)
+
+ def forward(self, x):
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf = [], [], [], [], []
+ dense_depth = None
+ for l, features in enumerate(x):
+ box3d_tower_out = self.box3d_tower(features)
+
+ _l = l if self.use_per_level_predictors else 0
+
+ # 3D box
+ quat = self.box3d_quat[_l](box3d_tower_out)
+ proj_ctr = self.box3d_ctr[_l](box3d_tower_out)
+ depth = self.box3d_depth[_l](box3d_tower_out)
+ size3d = self.box3d_size[_l](box3d_tower_out)
+ conf3d = self.box3d_conf[_l](box3d_tower_out)
+
+ if self.use_scale:
+ # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
+ proj_ctr = self.scales_proj_ctr[l](proj_ctr)
+ size3d = self.scales_size[l](size3d)
+ conf3d = self.scales_conf[l](conf3d)
+ depth = self.offsets_depth[l](self.scales_depth[l](depth))
+
+ box3d_quat.append(quat)
+ box3d_ctr.append(proj_ctr)
+ box3d_depth.append(depth)
+ box3d_size.append(size3d)
+ box3d_conf.append(conf3d)
+
+ return box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth
+
+
+class FCOS3DLoss(nn.Module):
+ def __init__(self,
+ num_classes,
+ min_depth=0.1,
+ max_depth=80.0,
+ box3d_loss_weight=2.0,
+ conf3d_loss_weight=1.0,
+ conf_3d_temperature=1.0,
+ smooth_l1_loss_beta=0.05,
+ max_loss_per_group=20,
+ predict_allocentric_rot=True,
+ scale_depth_by_focal_lengths=True,
+ scale_depth_by_focal_lengths_factor=500.0,
+ class_agnostic=False,
+ predict_distance=False,
+ canon_box_sizes=None):
+ super().__init__()
+ self.canon_box_sizes = canon_box_sizes
+ self.min_depth = min_depth
+ self.max_depth = max_depth
+ self.predict_allocentric_rot = predict_allocentric_rot
+ self.scale_depth_by_focal_lengths = scale_depth_by_focal_lengths
+ self.scale_depth_by_focal_lengths_factor = scale_depth_by_focal_lengths_factor
+ self.predict_distance = predict_distance
+
+ self.box3d_reg_loss_fn = DisentangledBox3DLoss(smooth_l1_loss_beta, max_loss_per_group)
+ self.box3d_loss_weight = box3d_loss_weight
+ self.conf3d_loss_weight = conf3d_loss_weight
+ self.conf_3d_temperature = conf_3d_temperature
+
+ self.num_classes = num_classes
+ self.class_agnostic = class_agnostic
+
+    @force_fp32(apply_to=('box3d_quat', 'box3d_ctr', 'box3d_depth', 'box3d_size', 'box3d_conf', 'inv_intrinsics'))
+ def forward(
+ self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, fcos2d_info,
+ targets
+ ):
+ labels = targets['labels']
+ box3d_targets = targets['box3d_targets']
+ pos_inds = targets["pos_inds"]
+
+ if pos_inds.numel() == 0:
+ losses = {
+ "loss_box3d_quat": torch.stack([x.sum() * 0. for x in box3d_quat]).sum(),
+ "loss_box3d_proj_ctr": torch.stack([x.sum() * 0. for x in box3d_ctr]).sum(),
+ "loss_box3d_depth": torch.stack([x.sum() * 0. for x in box3d_depth]).sum(),
+ "loss_box3d_size": torch.stack([x.sum() * 0. for x in box3d_size]).sum(),
+ "loss_conf3d": torch.stack([x.sum() * 0. for x in box3d_conf]).sum()
+ }
+ return losses
+
+ if len(labels) != len(box3d_targets):
+ raise ValueError(
+ f"The size of 'labels' and 'box3d_targets' does not match: a={len(labels)}, b={len(box3d_targets)}"
+ )
+
+ num_classes = self.num_classes if not self.class_agnostic else 1
+
+ box3d_quat_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4, num_classes) for x in box3d_quat])
+ box3d_ctr_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 2, num_classes) for x in box3d_ctr])
+ box3d_depth_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_depth])
+ box3d_size_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 3, num_classes) for x in box3d_size])
+ box3d_conf_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_conf])
+
+ # ----------------------
+ # 3D box disentangled loss
+ # ----------------------
+ box3d_targets = box3d_targets[pos_inds]
+
+ box3d_quat_pred = box3d_quat_pred[pos_inds]
+ box3d_ctr_pred = box3d_ctr_pred[pos_inds]
+ box3d_depth_pred = box3d_depth_pred[pos_inds]
+ box3d_size_pred = box3d_size_pred[pos_inds]
+ box3d_conf_pred = box3d_conf_pred[pos_inds]
+
+ if self.class_agnostic:
+ box3d_quat_pred = box3d_quat_pred.squeeze(-1)
+ box3d_ctr_pred = box3d_ctr_pred.squeeze(-1)
+ box3d_depth_pred = box3d_depth_pred.squeeze(-1)
+ box3d_size_pred = box3d_size_pred.squeeze(-1)
+ box3d_conf_pred = box3d_conf_pred.squeeze(-1)
+ else:
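+            # Class-aware heads predict one set of 3D parameters per class;
+            # gather() selects, for each foreground location, the slice that
+            # corresponds to its ground-truth class label.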
+ I = labels[pos_inds][..., None, None]
+ box3d_quat_pred = torch.gather(box3d_quat_pred, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
+ box3d_ctr_pred = torch.gather(box3d_ctr_pred, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
+ box3d_depth_pred = torch.gather(box3d_depth_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
+ box3d_size_pred = torch.gather(box3d_size_pred, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
+ box3d_conf_pred = torch.gather(box3d_conf_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
+
+ canon_box_sizes = box3d_quat_pred.new_tensor(self.canon_box_sizes)[labels[pos_inds]]
+
+ locations = targets["locations"][pos_inds]
+ im_inds = targets["im_inds"][pos_inds]
+ inv_intrinsics = inv_intrinsics[im_inds]
+
+ box3d_pred = predictions_to_boxes3d(
+ box3d_quat_pred,
+ box3d_ctr_pred,
+ box3d_depth_pred,
+ box3d_size_pred,
+ locations,
+ inv_intrinsics,
+ canon_box_sizes,
+ self.min_depth,
+ self.max_depth,
+ scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
+ scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
+ quat_is_allocentric=self.predict_allocentric_rot,
+ depth_is_distance=self.predict_distance
+ )
+
+ centerness_targets = fcos2d_info["centerness_targets"]
+ loss_denom = fcos2d_info["loss_denom"]
+ losses_box3d, box3d_l1_error = self.box3d_reg_loss_fn(box3d_pred, box3d_targets, locations, centerness_targets)
+
+ losses_box3d = {k: self.box3d_loss_weight * v / loss_denom for k, v in losses_box3d.items()}
+
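+        # Soft confidence target exp(-corner_L1_error / T): boxes whose decoded
+        # corners match the ground truth get a target near 1, decaying with the
+        # entangled corner error; it is supervised with BCE, weighted by the 2D
+        # centerness targets.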
+ conf_3d_targets = torch.exp(-1. / self.conf_3d_temperature * box3d_l1_error)
+ loss_conf3d = F.binary_cross_entropy_with_logits(box3d_conf_pred, conf_3d_targets, reduction='none')
+ loss_conf3d = self.conf3d_loss_weight * (loss_conf3d * centerness_targets).sum() / loss_denom
+
+ losses = {"loss_conf3d": loss_conf3d, **losses_box3d}
+
+ return losses
+
+
+class FCOS3DInference():
+ def __init__(self, cfg):
+ self.canon_box_sizes = cfg.DD3D.FCOS3D.CANONICAL_BOX3D_SIZES
+ self.min_depth = cfg.DD3D.FCOS3D.MIN_DEPTH
+ self.max_depth = cfg.DD3D.FCOS3D.MAX_DEPTH
+ self.predict_allocentric_rot = cfg.DD3D.FCOS3D.PREDICT_ALLOCENTRIC_ROT
+ self.scale_depth_by_focal_lengths = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS
+ self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR
+ self.predict_distance = cfg.DD3D.FCOS3D.PREDICT_DISTANCE
+
+ self.num_classes = cfg.DD3D.NUM_CLASSES
+ self.class_agnostic = cfg.DD3D.FCOS3D.CLASS_AGNOSTIC_BOX3D
+
+ def __call__(
+ self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
+ ):
+ # pred_instances: # List[List[Instances]], shape = (L, B)
+ for lvl, (box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl) in \
+ enumerate(zip(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf)):
+
+ # In-place modification: update per-level pred_instances.
+ self.forward_for_single_feature_map(
+ box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl, inv_intrinsics,
+ pred_instances[lvl], fcos2d_info[lvl]
+ ) # List of Instances; one for each image.
+
+ def forward_for_single_feature_map(
+ self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
+ ):
+ N = box3d_quat.shape[0]
+
+ num_classes = self.num_classes if not self.class_agnostic else 1
+
+ box3d_quat = box3d_quat.permute(0, 2, 3, 1).reshape(N, -1, 4, num_classes)
+ box3d_ctr = box3d_ctr.permute(0, 2, 3, 1).reshape(N, -1, 2, num_classes)
+ box3d_depth = box3d_depth.permute(0, 2, 3, 1).reshape(N, -1, num_classes)
+ box3d_size = box3d_size.permute(0, 2, 3, 1).reshape(N, -1, 3, num_classes)
+ box3d_conf = box3d_conf.permute(0, 2, 3, 1).reshape(N, -1, num_classes).sigmoid()
+
+ for i in range(N):
+ fg_inds_per_im = fcos2d_info['fg_inds_per_im'][i]
+ class_inds_per_im = fcos2d_info['class_inds_per_im'][i]
+ topk_indices = fcos2d_info['topk_indices'][i]
+
+ box3d_quat_per_im = box3d_quat[i][fg_inds_per_im]
+ box3d_ctr_per_im = box3d_ctr[i][fg_inds_per_im]
+ box3d_depth_per_im = box3d_depth[i][fg_inds_per_im]
+ box3d_size_per_im = box3d_size[i][fg_inds_per_im]
+ box3d_conf_per_im = box3d_conf[i][fg_inds_per_im]
+
+ if self.class_agnostic:
+ box3d_quat_per_im = box3d_quat_per_im.squeeze(-1)
+ box3d_ctr_per_im = box3d_ctr_per_im.squeeze(-1)
+ box3d_depth_per_im = box3d_depth_per_im.squeeze(-1)
+ box3d_size_per_im = box3d_size_per_im.squeeze(-1)
+ box3d_conf_per_im = box3d_conf_per_im.squeeze(-1)
+ else:
+ I = class_inds_per_im[..., None, None]
+ box3d_quat_per_im = torch.gather(box3d_quat_per_im, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
+ box3d_ctr_per_im = torch.gather(box3d_ctr_per_im, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
+ box3d_depth_per_im = torch.gather(box3d_depth_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
+ box3d_size_per_im = torch.gather(box3d_size_per_im, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
+ box3d_conf_per_im = torch.gather(box3d_conf_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
+
+ if topk_indices is not None:
+ box3d_quat_per_im = box3d_quat_per_im[topk_indices]
+ box3d_ctr_per_im = box3d_ctr_per_im[topk_indices]
+ box3d_depth_per_im = box3d_depth_per_im[topk_indices]
+ box3d_size_per_im = box3d_size_per_im[topk_indices]
+ box3d_conf_per_im = box3d_conf_per_im[topk_indices]
+
+ # scores_per_im = pred_instances[i].scores.square()
+ # NOTE: Before refactoring, the squared score was used. Is raw 2D score better?
+ scores_per_im = pred_instances[i].scores
+ scores_3d_per_im = scores_per_im * box3d_conf_per_im
+
+ canon_box_sizes = box3d_quat.new_tensor(self.canon_box_sizes)[pred_instances[i].pred_classes]
+ inv_K = inv_intrinsics[i][None, ...].expand(len(box3d_quat_per_im), 3, 3)
+ locations = pred_instances[i].locations
+ pred_boxes3d = predictions_to_boxes3d(
+ box3d_quat_per_im,
+ box3d_ctr_per_im,
+ box3d_depth_per_im,
+ box3d_size_per_im,
+ locations,
+ inv_K,
+ canon_box_sizes,
+ self.min_depth,
+ self.max_depth,
+ scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
+ scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
+ quat_is_allocentric=self.predict_allocentric_rot,
+ depth_is_distance=self.predict_distance
+ )
+
+ # In-place modification: add fields to instances.
+ pred_instances[i].pred_boxes3d = pred_boxes3d
+ pred_instances[i].scores_3d = scores_3d_per_im
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py
new file mode 100644
index 0000000..04a78d7
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py
@@ -0,0 +1,525 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import torch
+import torch.nn.functional as F
+from mmcv.losses.fvcore_smooth_l1_loss import smooth_l1_loss
+from torch import nn
+
+from mmcv.structures import Instances
+from mmcv.models.builder import HEADS
+from mmcv.utils import force_fp32
+from torch import distributed as dist
+from mmcv.modeling.postprocessing import detector_postprocess as resize_instances
+from mmcv.layers import cat, Conv2d
+from adzoo.bevformer.mmdet3d_plugin.dd3d.datasets.nuscenes import MAX_NUM_ATTRIBUTES
+from .core import DD3D
+from .prepare_targets import DD3DTargetPreparer
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.image_list import ImageList
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
+
+INF = 100000000.
+
+def get_world_size() -> int:
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size()
+
+class NuscenesDD3DTargetPreparer(DD3DTargetPreparer):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ assert self.dd3d_enabled, f"{type(self).__name__} requires dd3d_enabled = True"
+
+ def __call__(self, locations, gt_instances, feature_shapes):
+ num_loc_list = [len(loc) for loc in locations]
+
+ # compute locations to size ranges
+ loc_to_size_range = []
+ for l, loc_per_level in enumerate(locations):
+ loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
+ loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1))
+
+ loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
+ locations = torch.cat(locations, dim=0)
+
+ training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list)
+
+ training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
+ training_targets["im_inds"] = [
+ locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
+ ]
+
+ box2d = training_targets.pop("box2d", None)
+
+ # transpose im first training_targets to level first ones
+ training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"}
+
+ training_targets["fpn_levels"] = [
+ loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
+ ]
+
+ # Flatten targets: (L x B x H x W, TARGET_SIZE)
+ labels = cat([x.reshape(-1) for x in training_targets["labels"]])
+ box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]])
+
+ target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]])
+ locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]])
+ im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]])
+ fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]])
+
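+        # Locations that were not matched to any ground-truth box carry the
+        # label self.num_classes (background); the remaining ones are
+        # foreground.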
+ pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)
+
+ targets = {
+ "labels": labels,
+ "box2d_reg_targets": box2d_reg_targets,
+ "locations": locations,
+ "target_inds": target_inds,
+ "im_inds": im_inds,
+ "fpn_levels": fpn_levels,
+ "pos_inds": pos_inds
+ }
+
+ if self.dd3d_enabled:
+ box3d_targets = Boxes3D.cat(training_targets["box3d"])
+ targets.update({"box3d_targets": box3d_targets})
+
+ if box2d is not None:
+ # Original format is B x L x (H x W, 4)
+ # Need to be in L x (B, 4, H, W).
+ batched_box2d = []
+ for lvl, per_lvl_box2d in enumerate(zip(*box2d)):
+ # B x (H x W, 4)
+ h, w = feature_shapes[lvl]
+ batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0)
+ batched_box2d.append(batched_box2d_lvl)
+ targets.update({"batched_box2d": batched_box2d})
+
+ # Nuscenes targets -- attribute / speed
+ attributes = cat([x.reshape(-1) for x in training_targets["attributes"]])
+ speeds = cat([x.reshape(-1) for x in training_targets["speeds"]])
+
+ targets.update({'attributes': attributes, 'speeds': speeds})
+
+ return targets
+
+ def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
+ labels = []
+ box2d_reg = []
+
+ if self.dd3d_enabled:
+ box3d = []
+
+ target_inds = []
+ xs, ys = locations[:, 0], locations[:, 1]
+
+ # NuScenes targets -- attribute / speed
+ attributes, speeds = [], []
+
+ num_targets = 0
+ for im_i in range(len(targets)):
+ targets_per_im = targets[im_i]
+ bboxes = targets_per_im.gt_boxes.tensor
+ labels_per_im = targets_per_im.gt_classes
+
+ # no gt
+ if bboxes.numel() == 0:
+ labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
+ # reg_targets.append(locations.new_zeros((locations.size(0), 4)))
+ box2d_reg.append(locations.new_zeros((locations.size(0), 4)))
+ target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
+
+ if self.dd3d_enabled:
+ box3d.append(
+ Boxes3D(
+ locations.new_zeros(locations.size(0), 4),
+ locations.new_zeros(locations.size(0), 2),
+ locations.new_zeros(locations.size(0), 1),
+ locations.new_zeros(locations.size(0), 3),
+ locations.new_zeros(locations.size(0), 3, 3),
+ ).to(torch.float32)
+ )
+ # NOTE: attributes and speeds.
+ attributes.append(labels_per_im.new_zeros(locations.size(0)))
+ speeds.append(labels_per_im.new_zeros(locations.size(0)))
+ continue
+
+ area = targets_per_im.gt_boxes.area()
+
+ l = xs[:, None] - bboxes[:, 0][None]
+ t = ys[:, None] - bboxes[:, 1][None]
+ r = bboxes[:, 2][None] - xs[:, None]
+ b = bboxes[:, 3][None] - ys[:, None]
+ # reg_targets_per_im = torch.stack([l, t, r, b], dim=2)
+ box2d_reg_per_im = torch.stack([l, t, r, b], dim=2)
+
+ if self.center_sample:
+ is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys)
+ else:
+ is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0
+
+ max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0]
+ # limit the regression range for each location
+ is_cared_in_the_level = \
+ (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
+ (max_reg_targets_per_im <= size_ranges[:, [1]])
+
+ locations_to_gt_area = area[None].repeat(len(locations), 1)
+ locations_to_gt_area[is_in_boxes == 0] = INF
+ locations_to_gt_area[is_cared_in_the_level == 0] = INF
+
+ # if there are still more than one objects for a location,
+ # we choose the one with minimal area
+ locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
+
+ box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds]
+ target_inds_per_im = locations_to_gt_inds + num_targets
+ num_targets += len(targets_per_im)
+
+ labels_per_im = labels_per_im[locations_to_gt_inds]
+ labels_per_im[locations_to_min_area == INF] = self.num_classes
+
+ labels.append(labels_per_im)
+ box2d_reg.append(box2d_reg_per_im)
+ target_inds.append(target_inds_per_im)
+
+ if self.dd3d_enabled:
+ # 3D box targets
+ box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds]
+ box3d.append(box3d_per_im)
+
+ # NuScenes targets -- attribute / speed
+ attributes_per_im = targets_per_im.gt_attributes[locations_to_gt_inds]
+ speeds_per_im = targets_per_im.gt_speeds[locations_to_gt_inds]
+ attributes.append(attributes_per_im)
+ speeds.append(speeds_per_im)
+
+ ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
+ if self.dd3d_enabled:
+ ret.update({"box3d": box3d})
+
+ # NuScenes targets -- attribute / speed
+ ret.update({"attributes": attributes, "speeds": speeds})
+
+ return ret
+
+
+class NuscenesLoss(nn.Module):
+ def __init__(self, attr_loss_weight=0.2, speed_loss_weight=0.2):
+ super().__init__()
+ self.attr_loss_weight = attr_loss_weight
+ self.speed_loss_weight = speed_loss_weight
+
+ @force_fp32(apply_to=('attr_logits', 'speeds'))
+ def forward(self, attr_logits, speeds, fcos2d_info, targets):
+ # Flatten predictions
+ attr_logits = cat([x.permute(0, 2, 3, 1).reshape(-1, MAX_NUM_ATTRIBUTES) for x in attr_logits])
+ speeds = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in speeds])
+
+ pos_inds = targets['pos_inds']
+
+ losses = {}
+
+ # 1. Attributes
+ attr_logits = attr_logits[pos_inds]
+ target_attr = targets['attributes'][pos_inds]
+ valid_attr_mask = target_attr != MAX_NUM_ATTRIBUTES # No attrs associated with class, or just attr missing.
+
+ if pos_inds.numel() == 0:
+ attr_weights = attr_logits.new_tensor(0.0) #torch.tensor(0.0).cuda()
+ else:
+ attr_weights = fcos2d_info['centerness_targets'][valid_attr_mask]
+ # Denominator for all foreground losses -- re-computed for features with valid attributes.
+ # attr_loss_denom = max(reduce_sum(attr_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
+            # NOTE: computing attr_weights_sum first and then feeding it to reduce_sum() works; the commented-out one-liner above does not.
+ attr_weights_sum = attr_weights.sum()
+ attr_loss_denom = max(reduce_sum(attr_weights_sum).item() / get_world_size(), 1e-6)
+
+ if valid_attr_mask.sum() == 0:
+ losses.update({"loss_attr": attr_logits.sum() * 0.})
+ else:
+ attr_logits = attr_logits[valid_attr_mask]
+ target_attr = target_attr[valid_attr_mask]
+
+ xent = F.cross_entropy(attr_logits, target_attr)
+ loss_attr = (xent * attr_weights).sum() / attr_loss_denom
+
+ losses.update({"loss_attr": self.attr_loss_weight * loss_attr})
+
+ # 2. Speed
+ speeds = speeds[pos_inds]
+ target_speeds = targets['speeds'][pos_inds]
+ # NOTE: some GT speeds are NaN.
+ valid_gt_mask = torch.logical_not(torch.isnan(target_speeds))
+
+ if pos_inds.numel() == 0:
+ speed_weights = speeds.new_tensor(0.0) #torch.tensor(0.0).cuda()
+ else:
+ speed_weights = fcos2d_info['centerness_targets'][valid_gt_mask]
+ # Denominator for all foreground losses -- re-computed for features with valid speeds.
+ # speed_loss_denom = max(reduce_sum(speed_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
+ speed_weights_sum = speed_weights.sum()
+ speed_loss_denom = max(reduce_sum(speed_weights_sum).item() / get_world_size(), 1e-6)
+
+        # NOTE: this early return is placed after the reduce_sum calls above so that all ranks stay in sync.
+        if pos_inds.numel() == 0:
+            losses = {"loss_attr": attr_logits.sum() * 0., "loss_speed": speeds.sum() * 0.}
+            # NOTE: This branch is probably unreachable, because training filters out images with empty annotations.
+            # NOTE: If it were reachable, attr_weights could otherwise be unavailable for the reduce_sum() call.
+            return losses
+
+ if valid_gt_mask.sum() == 0:
+ losses.update({"loss_speed": speeds.sum() * 0.})
+ # return losses
+ else:
+ speeds = speeds[valid_gt_mask]
+ target_speeds = target_speeds[valid_gt_mask]
+
+ l1_error = smooth_l1_loss(speeds, target_speeds, beta=0.05)
+ loss_speed = (l1_error * speed_weights).sum() / speed_loss_denom
+ losses.update({"loss_speed": self.speed_loss_weight * loss_speed})
+
+ return losses
+
+
+class NuscenesInference():
+ def __init__(self, cfg):
+ pass
+
+ def __call__(self, attr_logits, speeds, pred_instances, fcos2d_info):
+ """Add 'pred_attribute', 'pred_speed' to Instances in 'pred_instances'."""
+ N = attr_logits[0].shape[0]
+ for lvl, (attr_logits_lvl, speed_lvl, info_lvl, instances_lvl) in \
+ enumerate(zip(attr_logits, speeds, fcos2d_info, pred_instances)):
+
+ attr_logits_lvl = attr_logits_lvl.permute(0, 2, 3, 1).reshape(N, -1, MAX_NUM_ATTRIBUTES)
+ speed_lvl = speed_lvl.permute(0, 2, 3, 1).reshape(N, -1)
+ for i in range(N):
+ fg_inds_per_im = info_lvl['fg_inds_per_im'][i]
+ topk_indices = info_lvl['topk_indices'][i]
+
+ attr_logits_per_im = attr_logits_lvl[i][fg_inds_per_im]
+ speed_per_im = speed_lvl[i][fg_inds_per_im]
+
+ if topk_indices is not None:
+ attr_logits_per_im = attr_logits_per_im[topk_indices]
+ speed_per_im = speed_per_im[topk_indices]
+
+ if len(attr_logits_per_im) == 0:
+ instances_lvl[i].pred_attributes = instances_lvl[i].pred_classes.new_tensor([])
+ instances_lvl[i].pred_speeds = instances_lvl[i].scores.new_tensor([])
+ else:
+ instances_lvl[i].pred_attributes = attr_logits_per_im.argmax(dim=1)
+ instances_lvl[i].pred_speeds = speed_per_im
+
+
+@HEADS.register_module()
+class NuscenesDD3D(DD3D):
+ def __init__(self,
+ num_classes,
+ in_channels,
+ strides,
+ fcos2d_cfg=dict(),
+ fcos2d_loss_cfg=dict(),
+ fcos3d_cfg=dict(),
+ fcos3d_loss_cfg=dict(),
+ target_assign_cfg=dict(),
+ nusc_loss_weight=dict(),
+ box3d_on=True,
+ feature_locations_offset="none"):
+ super().__init__(num_classes,
+ in_channels,
+ strides,
+ fcos2d_cfg=fcos2d_cfg,
+ fcos2d_loss_cfg=fcos2d_loss_cfg,
+ fcos3d_cfg=fcos3d_cfg,
+ fcos3d_loss_cfg=fcos3d_loss_cfg,
+ target_assign_cfg=target_assign_cfg,
+ box3d_on=box3d_on,
+ feature_locations_offset=feature_locations_offset)
+
+ # backbone_output_shape = self.backbone_output_shape
+ # in_channels = backbone_output_shape[0].channels
+
+ # --------------------------------------------------------------------------
+ # NuScenes predictions -- attribute / speed, computed from cls_tower output.
+ # --------------------------------------------------------------------------
+ self.attr_logits = Conv2d(in_channels, MAX_NUM_ATTRIBUTES, kernel_size=3, stride=1, padding=1, bias=True)
+ self.speed = Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True, activation=F.relu)
+
+ # init weights
+ for modules in [self.attr_logits, self.speed]:
+ for l in modules.modules():
+ if isinstance(l, nn.Conv2d):
+ torch.nn.init.kaiming_uniform_(l.weight, a=1)
+ if l.bias is not None: # depth head may not have bias.
+ torch.nn.init.constant_(l.bias, 0)
+
+ # Re-define target preparer
+ del self.prepare_targets
+ self.prepare_targets = NuscenesDD3DTargetPreparer(num_classes=num_classes,
+ input_shape=self.backbone_output_shape,
+ box3d_on=box3d_on,
+ **target_assign_cfg)
+
+ self.nuscenes_loss = NuscenesLoss(**nusc_loss_weight)
+ # NOTE: inference later
+ # self.nuscenes_inference = NuscenesInference(cfg)
+
+ # self.num_images_per_sample = cfg.MODEL.FCOS3D.NUSC_NUM_IMAGES_PER_SAMPLE
+ # NOTE: inference later
+ # self.num_images_per_sample = cfg.DD3D.NUSC.INFERENCE.NUM_IMAGES_PER_SAMPLE
+
+ # assert self.num_images_per_sample == 6
+ # assert cfg.DATALOADER.TEST.NUM_IMAGES_PER_GROUP == 6
+
+ # NOTE: NuScenes evaluator allows max. 500 detections per sample.
+ # self.max_num_dets_per_sample = cfg.DD3D.NUSC.INFERENCE.MAX_NUM_DETS_PER_SAMPLE
+
+    @force_fp32(apply_to=('features',))
+ def forward(self, features, batched_inputs):
+ # NOTE:
+ # images = [x["image"].to(self.device) for x in batched_inputs]
+ # images = [self.preprocess_image(x) for x in images]
+
+ # NOTE: directly use inv_intrinsics
+ # if 'intrinsics' in batched_inputs[0]:
+ # intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
+ # else:
+ # intrinsics = None
+ # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
+ if 'inv_intrinsics' in batched_inputs[0]:
+ inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
+ inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
+ else:
+ inv_intrinsics = None
+
+ # NOTE:
+ # gt_dense_depth = None
+ # if 'depth' in batched_inputs[0]:
+ # gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
+ # gt_dense_depth = ImageList.from_tensors(
+ # gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
+ # )
+
+ # NOTE: directly input feature
+ # features = self.backbone(images.tensor)
+ # features = [features[f] for f in self.in_features]
+
+ if "instances" in batched_inputs[0]:
+ gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
+ else:
+ gt_instances = None
+
+ locations = self.compute_locations(features)
+ logits, box2d_reg, centerness, fcos2d_extra_output = self.fcos2d_head(features)
+ if not self.only_box2d:
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
+ # NOTE: directly use inv_intrinsics
+ # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
+
+ # --------------------------------------------------------------------------
+ # NuScenes predictions -- attribute / speed, computed from cls_tower output.
+ # --------------------------------------------------------------------------
+ attr_logits, speeds = [], []
+ for x in fcos2d_extra_output['cls_tower_out']:
+ attr_logits.append(self.attr_logits(x))
+ speeds.append(self.speed(x))
+
+ if self.training:
+ assert gt_instances is not None
+ feature_shapes = [x.shape[-2:] for x in features]
+ training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
+ # NOTE:
+ # if gt_dense_depth is not None:
+ # training_targets.update({"dense_depth": gt_dense_depth})
+
+ losses = {}
+ fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
+ losses.update(fcos2d_loss)
+
+ if not self.only_box2d:
+ fcos3d_loss = self.fcos3d_loss(
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
+ fcos2d_info, training_targets
+ )
+ losses.update(fcos3d_loss)
+
+ # Nuscenes loss -- attribute / speed
+ nuscenes_loss = self.nuscenes_loss(attr_logits, speeds, fcos2d_info, training_targets)
+ losses.update(nuscenes_loss)
+ return losses
+ else:
+            # TODO: inference is not supported yet
+ raise NotImplementedError
+ pred_instances, fcos2d_info = self.fcos2d_inference(
+ logits, box2d_reg, centerness, locations, images.image_sizes
+ )
+ if not self.only_box2d:
+ # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances'.
+ self.fcos3d_inference(
+ box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
+ fcos2d_info
+ )
+ score_key = "scores_3d"
+ else:
+ score_key = "scores"
+
+ # This adds 'pred_attributes', 'pred_speed' to Instances in 'pred_instances'.
+ self.nuscenes_inference(attr_logits, speeds, pred_instances, fcos2d_info)
+
+ # Transpose to "image-first", i.e. (B, L)
+ pred_instances = list(zip(*pred_instances))
+ pred_instances = [Instances.cat(instances) for instances in pred_instances]
+
+ # 2D NMS and pick top-K.
+ if self.do_nms:
+ pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
+
+ if not self.only_box2d and self.do_bev_nms:
+ # Bird-eye-view NMS.
+ dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
+ if 'pose' in batched_inputs[0]:
+ poses = [x['pose'] for x in batched_inputs]
+ else:
+ poses = [x['extrinsics'] for x in batched_inputs]
+ pred_instances = nuscenes_sample_aggregate(
+ pred_instances,
+ dummy_group_idxs,
+ self.num_classes,
+ poses,
+ iou_threshold=self.bev_nms_iou_thresh,
+ include_boxes3d_global=False
+ )
+
+ if self.postprocess_in_inference:
+ processed_results = []
+ for results_per_image, input_per_image, image_size in \
+ zip(pred_instances, batched_inputs, images.image_sizes):
+ height = input_per_image.get("height", image_size[0])
+ width = input_per_image.get("width", image_size[1])
+ r = resize_instances(results_per_image, height, width)
+ processed_results.append({"instances": r})
+
+ # ----------------------------------------------------------
+ # NuScenes specific: cross-image (i.e. sample-level) BEV NMS.
+ # ----------------------------------------------------------
+ sample_tokens = [x['sample_token'] for x in batched_inputs]
+ group_idxs = get_group_idxs(sample_tokens, self.num_images_per_sample)
+
+ instances = [x['instances'] for x in processed_results]
+ global_poses = [x['pose'] for x in batched_inputs]
+
+ filtered_instances = nuscenes_sample_aggregate(
+ instances,
+ group_idxs,
+ self.num_classes,
+ global_poses,
+ self.bev_nms_iou_thresh,
+ max_num_dets_per_sample=self.max_num_dets_per_sample
+ )
+ processed_results = [{"instances": x} for x in filtered_instances]
+ else:
+ processed_results = [{"instances": x} for x in pred_instances]
+
+ return processed_results
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/prepare_targets.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/prepare_targets.py
new file mode 100644
index 0000000..91f76b5
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/prepare_targets.py
@@ -0,0 +1,242 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import torch
+
+from mmcv.layers import cat
+
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+
+INF = 100000000.
+
+
+class DD3DTargetPreparer():
+ def __init__(self,
+ num_classes,
+ input_shape,
+ box3d_on=True,
+ center_sample=True,
+ pos_radius=1.5,
+ sizes_of_interest=None):
+ self.num_classes = num_classes
+ self.center_sample = center_sample
+ self.strides = [shape.stride for shape in input_shape]
+ self.radius = pos_radius
+ self.dd3d_enabled = box3d_on
+
+ # generate sizes of interest
+ # NOTE:
+ # soi = []
+ # prev_size = -1
+ # for s in sizes_of_interest:
+ # soi.append([prev_size, s])
+ # prev_size = s
+ # soi.append([prev_size, INF])
+ self.sizes_of_interest = sizes_of_interest
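+        # Per-level [min, max] regression ranges: a location on a given FPN
+        # level is only assigned objects whose largest (l, t, r, b) target
+        # falls inside that level's range (see compute_targets_for_locations).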
+
+ def __call__(self, locations, gt_instances, feature_shapes):
+ num_loc_list = [len(loc) for loc in locations]
+
+ # compute locations to size ranges
+ loc_to_size_range = []
+ for l, loc_per_level in enumerate(locations):
+ loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
+ loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1))
+
+ loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
+ locations = torch.cat(locations, dim=0)
+
+ training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list)
+
+ training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
+ training_targets["im_inds"] = [
+ locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
+ ]
+
+ box2d = training_targets.pop("box2d", None)
+
+ # transpose im first training_targets to level first ones
+ training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"}
+
+ training_targets["fpn_levels"] = [
+ loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
+ ]
+
+ # Flatten targets: (L x B x H x W, TARGET_SIZE)
+ labels = cat([x.reshape(-1) for x in training_targets["labels"]])
+ box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]])
+
+ target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]])
+ locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]])
+ im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]])
+ fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]])
+
+ pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)
+
+ targets = {
+ "labels": labels,
+ "box2d_reg_targets": box2d_reg_targets,
+ "locations": locations,
+ "target_inds": target_inds,
+ "im_inds": im_inds,
+ "fpn_levels": fpn_levels,
+ "pos_inds": pos_inds
+ }
+
+ if self.dd3d_enabled:
+ box3d_targets = Boxes3D.cat(training_targets["box3d"])
+ targets.update({"box3d_targets": box3d_targets})
+
+ if box2d is not None:
+ # Original format is B x L x (H x W, 4)
+ # Need to be in L x (B, 4, H, W).
+ batched_box2d = []
+ for lvl, per_lvl_box2d in enumerate(zip(*box2d)):
+ # B x (H x W, 4)
+ h, w = feature_shapes[lvl]
+ batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0)
+ batched_box2d.append(batched_box2d_lvl)
+ targets.update({"batched_box2d": batched_box2d})
+
+ return targets
+
+ def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
+ labels = []
+ box2d_reg = []
+
+ if self.dd3d_enabled:
+ box3d = []
+
+ target_inds = []
+ xs, ys = locations[:, 0], locations[:, 1]
+
+ num_targets = 0
+ for im_i in range(len(targets)):
+ targets_per_im = targets[im_i]
+ bboxes = targets_per_im.gt_boxes.tensor
+ labels_per_im = targets_per_im.gt_classes
+
+ # no gt
+ if bboxes.numel() == 0:
+ labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
+ # reg_targets.append(locations.new_zeros((locations.size(0), 4)))
+ box2d_reg.append(locations.new_zeros((locations.size(0), 4)))
+ target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
+
+ if self.dd3d_enabled:
+ box3d.append(
+ Boxes3D(
+ locations.new_zeros(locations.size(0), 4),
+ locations.new_zeros(locations.size(0), 2),
+ locations.new_zeros(locations.size(0), 1),
+ locations.new_zeros(locations.size(0), 3),
+ locations.new_zeros(locations.size(0), 3, 3),
+ ).to(torch.float32)
+ )
+ continue
+
+ area = targets_per_im.gt_boxes.area()
+
+ l = xs[:, None] - bboxes[:, 0][None]
+ t = ys[:, None] - bboxes[:, 1][None]
+ r = bboxes[:, 2][None] - xs[:, None]
+ b = bboxes[:, 3][None] - ys[:, None]
+ # reg_targets_per_im = torch.stack([l, t, r, b], dim=2)
+ box2d_reg_per_im = torch.stack([l, t, r, b], dim=2)
+
+ if self.center_sample:
+ is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys)
+ else:
+ is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0
+
+ max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0]
+ # limit the regression range for each location
+ is_cared_in_the_level = \
+ (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
+ (max_reg_targets_per_im <= size_ranges[:, [1]])
+
+ locations_to_gt_area = area[None].repeat(len(locations), 1)
+ locations_to_gt_area[is_in_boxes == 0] = INF
+ locations_to_gt_area[is_cared_in_the_level == 0] = INF
+
+ # if there are still more than one objects for a location,
+ # we choose the one with minimal area
+ locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
+
+ box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds]
+ target_inds_per_im = locations_to_gt_inds + num_targets
+ num_targets += len(targets_per_im)
+
+ labels_per_im = labels_per_im[locations_to_gt_inds]
+ labels_per_im[locations_to_min_area == INF] = self.num_classes
+
+ labels.append(labels_per_im)
+ box2d_reg.append(box2d_reg_per_im)
+ target_inds.append(target_inds_per_im)
+
+ if self.dd3d_enabled:
+ # 3D box targets
+ box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds]
+ box3d.append(box3d_per_im)
+
+ ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
+ if self.dd3d_enabled:
+ ret.update({"box3d": box3d})
+
+ return ret
+
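+    # Center sampling (FCOS-style, inferred from the logic below): a location only counts
+    # as inside a GT box if it lies within `pos_radius * stride` of the box center,
+    # with that sampling region clipped to the box itself.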
+ def get_sample_region(self, boxes, num_loc_list, loc_xs, loc_ys):
+ center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5
+ center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5
+
+ num_gts = boxes.shape[0]
+ K = len(loc_xs)
+ boxes = boxes[None].expand(K, num_gts, 4)
+ center_x = center_x[None].expand(K, num_gts)
+ center_y = center_y[None].expand(K, num_gts)
+ center_gt = boxes.new_zeros(boxes.shape)
+ # no gt
+ if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
+ return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
+ beg = 0
+ for level, num_loc in enumerate(num_loc_list):
+ end = beg + num_loc
+ stride = self.strides[level] * self.radius
+ xmin = center_x[beg:end] - stride
+ ymin = center_y[beg:end] - stride
+ xmax = center_x[beg:end] + stride
+ ymax = center_y[beg:end] + stride
+ # limit sample region in gt
+ center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0])
+ center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1])
+ center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax)
+ center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax)
+ beg = end
+ left = loc_xs[:, None] - center_gt[..., 0]
+ right = center_gt[..., 2] - loc_xs[:, None]
+ top = loc_ys[:, None] - center_gt[..., 1]
+ bottom = center_gt[..., 3] - loc_ys[:, None]
+ center_bbox = torch.stack((left, top, right, bottom), -1)
+ inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
+ return inside_gt_bbox_mask
+
+ def _transpose(self, training_targets, num_loc_list):
+ '''
+        Transpose image-first training targets to level-first ones.
+        :return: level-first training targets
+ '''
+ if isinstance(training_targets[0], Boxes3D):
+ for im_i in range(len(training_targets)):
+ # training_targets[im_i] = torch.split(training_targets[im_i], num_loc_list, dim=0)
+ training_targets[im_i] = training_targets[im_i].split(num_loc_list, dim=0)
+
+ targets_level_first = []
+ for targets_per_level in zip(*training_targets):
+ targets_level_first.append(Boxes3D.cat(targets_per_level, dim=0))
+ return targets_level_first
+
+ for im_i in range(len(training_targets)):
+ training_targets[im_i] = torch.split(training_targets[im_i], num_loc_list, dim=0)
+
+ targets_level_first = []
+ for targets_per_level in zip(*training_targets):
+ targets_level_first.append(torch.cat(targets_per_level, dim=0))
+ return targets_level_first
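+
+# Rough shape reference for the prepared targets (a sketch, not part of the original code;
+# `target_preparer` is an illustrative name for an instance of the class above):
+#   locations:      list of (H_l * W_l, 2) pixel-center tensors, one per FPN level
+#   feature_shapes: list of (H_l, W_l) tuples, one per level
+#   targets = target_preparer(locations, gt_instances, feature_shapes)
+#   targets["labels"]            -> (sum_l B * H_l * W_l,)
+#   targets["box2d_reg_targets"] -> (sum_l B * H_l * W_l, 4), (l, t, r, b) offsets
+#   targets["pos_inds"]          -> indices of foreground locations (labels != num_classes)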
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/__init__.py
new file mode 100644
index 0000000..3857649
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+from .image_list import ImageList
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/boxes3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/boxes3d.py
new file mode 100644
index 0000000..0823602
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/boxes3d.py
@@ -0,0 +1,321 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import numpy as np
+import torch
+from pyquaternion import Quaternion
+from torch.cuda import amp
+
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import unproject_points2d
+import adzoo.bevformer.mmdet3d_plugin.dd3d.structures.transform3d as t3d
+# yapf: disable
+BOX3D_CORNER_MAPPING = [
+ [1, 1, 1, 1, -1, -1, -1, -1],
+ [1, -1, -1, 1, 1, -1, -1, 1],
+ [1, 1, -1, -1, 1, 1, -1, -1]
+]
+# yapf: enable
+
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as quaternions to rotation matrices.
+
+ Args:
+ quaternions: quaternions with real part first,
+ as tensor of shape (..., 4).
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+ r, i, j, k = torch.unbind(quaternions, -1)
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
+
+ o = torch.stack(
+ (
+ 1 - two_s * (j * j + k * k),
+ two_s * (i * j - k * r),
+ two_s * (i * k + j * r),
+ two_s * (i * j + k * r),
+ 1 - two_s * (i * i + k * k),
+ two_s * (j * k - i * r),
+ two_s * (i * k - j * r),
+ two_s * (j * k + i * r),
+ 1 - two_s * (i * i + j * j),
+ ),
+ -1,
+ )
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
+
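+# Quick sanity check (illustrative, not part of the original file): a 90-degree rotation
+# about Z, q = (cos 45deg, 0, 0, sin 45deg), maps x to y under this column-vector convention.
+#   >>> q = torch.tensor([[0.7071068, 0.0, 0.0, 0.7071068]])
+#   >>> quaternion_to_matrix(q) @ torch.tensor([1.0, 0.0, 0.0])
+#   tensor([[0., 1., 0.]])  # up to float rounding
+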
+def _to_tensor(x, dim):
+ if isinstance(x, torch.Tensor):
+ x = x.to(torch.float32)
+ elif isinstance(x, np.ndarray) or isinstance(x, list) or isinstance(x, tuple):
+ x = torch.tensor(x, dtype=torch.float32)
+ elif isinstance(x, Quaternion):
+ x = torch.tensor(x.elements, dtype=torch.float32)
+ else:
+ raise ValueError(f"Unsupported type: {type(x).__name__}")
+
+ if x.ndim == 1:
+ x = x.reshape(-1, dim)
+ elif x.ndim > 2:
+ raise ValueError(f"Invalid shape of input: {x.shape.__str__()}")
+ return x
+
+
+class GenericBoxes3D():
+ def __init__(self, quat, tvec, size):
+ self.quat = _to_tensor(quat, dim=4)
+ self._tvec = _to_tensor(tvec, dim=3)
+ self.size = _to_tensor(size, dim=3)
+
+ @property
+ def tvec(self):
+ return self._tvec
+
+ @property
+ @amp.autocast(enabled=False)
+ def corners(self):
+ allow_tf32 = torch.backends.cuda.matmul.allow_tf32
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+
+ translation = t3d.Translate(self.tvec, device=self.device)
+
+ R = quaternion_to_matrix(self.quat)
+        rotation = t3d.Rotate(R=R.transpose(1, 2), device=self.device)  # transpose: quaternion_to_matrix uses the column-vector convention, while Transform3d applies matrices to row vectors.
+
+ tfm = rotation.compose(translation)
+
+ _corners = 0.5 * self.quat.new_tensor(BOX3D_CORNER_MAPPING).T
+ # corners_in_obj_frame = self.size.unsqueeze(1) * _corners.unsqueeze(0)
+ lwh = self.size[:, [1, 0, 2]] # wlh -> lwh
+ corners_in_obj_frame = lwh.unsqueeze(1) * _corners.unsqueeze(0)
+
+ corners3d = tfm.transform_points(corners_in_obj_frame)
+ torch.backends.cuda.matmul.allow_tf32 = allow_tf32
+ torch.backends.cudnn.allow_tf32 = allow_tf32
+ return corners3d
+
+ @classmethod
+ def from_vectors(cls, vecs, device="cpu"):
+ """
+ Parameters
+ ----------
+ vecs: Iterable[np.ndarray]
+            Iterable of 10D pose representations: (quat, tvec, size).
+        """
+ quats, tvecs, sizes = [], [], []
+ for vec in vecs:
+ quat = vec[:4]
+ tvec = vec[4:7]
+ size = vec[7:]
+
+ quats.append(quat)
+ tvecs.append(tvec)
+ sizes.append(size)
+
+ quats = torch.as_tensor(quats, dtype=torch.float32, device=device)
+ tvecs = torch.as_tensor(tvecs, dtype=torch.float32, device=device)
+ sizes = torch.as_tensor(sizes, device=device)
+
+ return cls(quats, tvecs, sizes)
+
+ @classmethod
+ def cat(cls, boxes_list, dim=0):
+
+ assert isinstance(boxes_list, (list, tuple))
+ if len(boxes_list) == 0:
+ return cls(torch.empty(0), torch.empty(0), torch.empty(0))
+ assert all([isinstance(box, GenericBoxes3D) for box in boxes_list])
+
+ # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+ quat = torch.cat([b.quat for b in boxes_list], dim=dim)
+ tvec = torch.cat([b.tvec for b in boxes_list], dim=dim)
+ size = torch.cat([b.size for b in boxes_list], dim=dim)
+
+ cat_boxes = cls(quat, tvec, size)
+ return cat_boxes
+
+ def split(self, split_sizes, dim=0):
+ assert sum(split_sizes) == len(self)
+ quat_list = torch.split(self.quat, split_sizes, dim=dim)
+ tvec_list = torch.split(self.tvec, split_sizes, dim=dim)
+ size_list = torch.split(self.size, split_sizes, dim=dim)
+
+ return [GenericBoxes3D(*x) for x in zip(quat_list, tvec_list, size_list)]
+
+ def __getitem__(self, item):
+ """
+ """
+ if isinstance(item, int):
+ return GenericBoxes3D(self.quat[item].view(1, -1), self.tvec[item].view(1, -1), self.size[item].view(1, -1))
+
+ quat = self.quat[item]
+ tvec = self.tvec[item]
+ size = self.size[item]
+
+ assert quat.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert tvec.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert size.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+
+ return GenericBoxes3D(quat, tvec, size)
+
+ def __len__(self):
+ assert len(self.quat) == len(self.tvec) == len(self.size)
+ return self.quat.shape[0]
+
+ def clone(self):
+ """
+ """
+ return GenericBoxes3D(self.quat.clone(), self.tvec.clone(), self.size.clone())
+
+ def vectorize(self):
+ xyz = self.tvec
+ return torch.cat([self.quat, xyz, self.size], dim=1)
+
+ @property
+ def device(self):
+ return self.quat.device
+
+ def to(self, *args, **kwargs):
+ quat = self.quat.to(*args, **kwargs)
+ tvec = self.tvec.to(*args, **kwargs)
+ size = self.size.to(*args, **kwargs)
+ return GenericBoxes3D(quat, tvec, size)
+
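+# Minimal usage sketch (illustrative; assumes size is stored as (w, l, h), as the
+# "wlh -> lwh" comment in `corners` suggests):
+#   >>> box = GenericBoxes3D([1., 0., 0., 0.], [0., 0., 10.], [1.8, 4.5, 1.6])
+#   >>> box.vectorize().shape   # (1, 10): [quat | tvec | size]
+#   >>> box.corners.shape       # (1, 8, 3) box corners in the reference frame
+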
+
+class Boxes3D(GenericBoxes3D):
+ """Vision-based 3D box container.
+
+ The tvec is computed from projected center, depth, and intrinsics.
+ """
+ def __init__(self, quat, proj_ctr, depth, size, inv_intrinsics):
+ self.quat = quat
+ self.proj_ctr = proj_ctr
+ self.depth = depth
+ self.size = size
+ self.inv_intrinsics = inv_intrinsics
+
+ @property
+ def tvec(self):
+ ray = unproject_points2d(self.proj_ctr, self.inv_intrinsics)
+ xyz = ray * self.depth
+ return xyz
+
+ @classmethod
+ def from_vectors(cls, vecs, intrinsics, device="cpu"):
+ """
+ Parameters
+ ----------
+ vecs: Iterable[np.ndarray]
+ Iterable of 10D pose representation.
+
+ intrinsics: np.ndarray
+ (3, 3) intrinsics matrix.
+ """
+ if len(vecs) == 0:
+ quats = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 4)
+ proj_ctrs = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 2)
+ depths = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 1)
+ sizes = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 3)
+ inv_intrinsics = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 3, 3)
+ return cls(quats, proj_ctrs, depths, sizes, inv_intrinsics)
+
+ quats, proj_ctrs, depths, sizes = [], [], [], []
+ for vec in vecs:
+ quat = vec[:4]
+
+ proj_ctr = intrinsics.dot(vec[4:7])
+ proj_ctr = proj_ctr[:2] / proj_ctr[-1]
+
+ depth = vec[6:7]
+
+ size = vec[7:]
+
+ quats.append(quat)
+ proj_ctrs.append(proj_ctr)
+ depths.append(depth)
+ sizes.append(size)
+
+ quats = torch.as_tensor(np.array(quats), dtype=torch.float32, device=device)
+ proj_ctrs = torch.as_tensor(np.array(proj_ctrs), dtype=torch.float32, device=device)
+ depths = torch.as_tensor(np.array(depths), dtype=torch.float32, device=device)
+ sizes = torch.as_tensor(np.array(sizes), dtype=torch.float32, device=device)
+
+ inv_intrinsics = np.linalg.inv(intrinsics)
+ inv_intrinsics = torch.as_tensor(inv_intrinsics[None, ...], device=device).expand(len(vecs), 3, 3)
+
+ return cls(quats, proj_ctrs, depths, sizes, inv_intrinsics)
+
+ @classmethod
+ def cat(cls, boxes_list, dim=0):
+
+ assert isinstance(boxes_list, (list, tuple))
+ if len(boxes_list) == 0:
+ return cls(torch.empty(0), torch.empty(0), torch.empty(0), torch.empty(0), torch.empty(0))
+ assert all([isinstance(box, Boxes3D) for box in boxes_list])
+
+ # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+ quat = torch.cat([b.quat for b in boxes_list], dim=dim)
+ proj_ctr = torch.cat([b.proj_ctr for b in boxes_list], dim=dim)
+ depth = torch.cat([b.depth for b in boxes_list], dim=dim)
+ size = torch.cat([b.size for b in boxes_list], dim=dim)
+ inv_intrinsics = torch.cat([b.inv_intrinsics for b in boxes_list], dim=dim)
+
+ cat_boxes = cls(quat, proj_ctr, depth, size, inv_intrinsics)
+ return cat_boxes
+
+ def split(self, split_sizes, dim=0):
+ assert sum(split_sizes) == len(self)
+ quat_list = torch.split(self.quat, split_sizes, dim=dim)
+ proj_ctr_list = torch.split(self.proj_ctr, split_sizes, dim=dim)
+ depth_list = torch.split(self.depth, split_sizes, dim=dim)
+ size_list = torch.split(self.size, split_sizes, dim=dim)
+ inv_K_list = torch.split(self.inv_intrinsics, split_sizes, dim=dim)
+
+ return [Boxes3D(*x) for x in zip(quat_list, proj_ctr_list, depth_list, size_list, inv_K_list)]
+
+ def __getitem__(self, item):
+ """
+ """
+ if isinstance(item, int):
+ return Boxes3D(
+ self.quat[item].view(1, -1), self.proj_ctr[item].view(1, -1), self.depth[item].view(1, -1),
+ self.size[item].view(1, -1), self.inv_intrinsics[item].view(1, 3, 3)
+ )
+
+ quat = self.quat[item]
+ ctr = self.proj_ctr[item]
+ depth = self.depth[item]
+ size = self.size[item]
+ inv_K = self.inv_intrinsics[item]
+
+ assert quat.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert ctr.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert depth.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert size.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert inv_K.dim() == 3, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+ assert inv_K.shape[1:] == (3, 3), "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+
+ return Boxes3D(quat, ctr, depth, size, inv_K)
+
+ def __len__(self):
+ assert len(self.quat) == len(self.proj_ctr) == len(self.depth) == len(self.size) == len(self.inv_intrinsics)
+ return self.quat.shape[0]
+
+ def clone(self):
+ """
+ """
+ return Boxes3D(
+ self.quat.clone(), self.proj_ctr.clone(), self.depth.clone(), self.size.clone(), self.inv_intrinsics.clone()
+ )
+
+ def to(self, *args, **kwargs):
+ quat = self.quat.to(*args, **kwargs)
+ proj_ctr = self.proj_ctr.to(*args, **kwargs)
+ depth = self.depth.to(*args, **kwargs)
+ size = self.size.to(*args, **kwargs)
+ inv_K = self.inv_intrinsics.to(*args, **kwargs)
+ return Boxes3D(quat, proj_ctr, depth, size, inv_K)
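+
+# Illustrative construction from 10D vectors (a sketch, not part of the original code;
+# K below is a made-up pinhole intrinsics matrix):
+#   >>> K = np.array([[1000., 0., 960.], [0., 1000., 540.], [0., 0., 1.]])
+#   >>> vec = np.array([1., 0., 0., 0.,  2., 1., 20.,  1.8, 4.5, 1.6])  # quat | tvec | size
+#   >>> boxes = Boxes3D.from_vectors([vec], K)
+#   >>> boxes.proj_ctr         # tensor([[1060., 590.]]) -- projected box center in pixels
+#   >>> boxes.depth            # tensor([[20.]]) -- depth is the z component of tvec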
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/image_list.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/image_list.py
new file mode 100644
index 0000000..f27b3c0
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/image_list.py
@@ -0,0 +1,157 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+from __future__ import division
+
+from typing import Any, List, Sequence, Tuple
+
+import torch
+from torch import device
+from torch.nn import functional as F
+
+TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
+
+def _as_tensor(x: Tuple[int, int]) -> torch.Tensor:
+ """
+ An equivalent of `torch.as_tensor`, but works under tracing if input
+    An equivalent of `torch.as_tensor`, but works under tracing if the input
+    is a list of tensors. `torch.as_tensor` will record a constant in tracing,
+ """
+ if torch.jit.is_scripting():
+ return torch.as_tensor(x)
+ if isinstance(x, (list, tuple)) and all([isinstance(t, torch.Tensor) for t in x]):
+ return torch.stack(x)
+ return torch.as_tensor(x)
+
+
+class ImageList(object):
+ """
+ Adapted from detectron2:
+    https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/image_list.py
+
+ Key differences:
+ - add optional intrinsics
+ - add optional image path (useful for debugging)
+ ==================================================================================================================
+
+ Structure that holds a list of images (of possibly
+ varying sizes) as a single tensor.
+ This works by padding the images to the same size,
+ and storing in a field the original sizes of each image
+
+ Attributes:
+ image_sizes (list[tuple[int, int]]): each tuple is (h, w)
+ """
+ def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]], intrinsics=None, image_paths=None):
+ """
+ Arguments:
+ tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1
+ image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can
+ be smaller than (H, W) due to padding.
+ """
+ self.tensor = tensor
+ self.image_sizes = image_sizes
+ self._intrinsics = intrinsics
+ self._image_paths = image_paths
+
+ @property
+ def intrinsics(self):
+ if torch.allclose(self._intrinsics[0], torch.eye(3, device=self._intrinsics.device)):
+            # TODO: torch.inverse(images.intrinsics) often returns identity when it shouldn't. Is it a PyTorch bug?
+ raise ValueError("Intrinsics is Identity.")
+ return self._intrinsics
+
+ @property
+ def image_paths(self):
+ return self._image_paths
+
+ def __len__(self) -> int:
+ return len(self.image_sizes)
+
+ def __getitem__(self, idx) -> torch.Tensor:
+ """
+ Access the individual image in its original size.
+
+ Args:
+ idx: int or slice
+
+ Returns:
+ Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1
+ """
+ size = self.image_sizes[idx]
+ return self.tensor[idx, ..., :size[0], :size[1]]
+
+ @torch.jit.unused
+ def to(self, *args: Any, **kwargs: Any) -> "ImageList":
+        cast_tensor = self.tensor.to(*args, **kwargs)
+        # Use the raw attributes so `to()` also works when intrinsics / image paths were not provided.
+        return ImageList(cast_tensor, self.image_sizes, intrinsics=self._intrinsics, image_paths=self._image_paths)
+
+ @property
+ def device(self) -> device:
+ return self.tensor.device
+
+ @staticmethod
+ def from_tensors(
+ tensors: List[torch.Tensor],
+ size_divisibility: int = 0,
+ pad_value: float = 0.0,
+ intrinsics=None,
+ image_paths=None
+ ) -> "ImageList":
+ """
+ Args:
+ tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or
+ (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded
+ to the same shape with `pad_value`.
+ size_divisibility (int): If `size_divisibility > 0`, add padding to ensure
+ the common height and width is divisible by `size_divisibility`.
+ This depends on the model and many models need a divisibility of 32.
+ pad_value (float): value to pad
+
+ Returns:
+ an `ImageList`.
+ """
+ assert len(tensors) > 0
+ assert isinstance(tensors, (tuple, list))
+ for t in tensors:
+ assert isinstance(t, torch.Tensor), type(t)
+ assert t.shape[:-2] == tensors[0].shape[:-2], t.shape
+
+ image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors]
+ image_sizes_tensor = [_as_tensor(x) for x in image_sizes]
+ max_size = torch.stack(image_sizes_tensor).max(0).values
+
+ if size_divisibility > 1:
+ stride = size_divisibility
+ # the last two dims are H,W, both subject to divisibility requirement
+ max_size = torch.div(max_size + (stride - 1), stride, rounding_mode='floor') * stride
+
+ # handle weirdness of scripting and tracing ...
+ if torch.jit.is_scripting():
+ max_size: List[int] = max_size.to(dtype=torch.long).tolist()
+ else:
+ # https://github.com/pytorch/pytorch/issues/42448
+ if TORCH_VERSION >= (1, 7) and torch.jit.is_tracing():
+ image_sizes = image_sizes_tensor
+
+ if len(tensors) == 1:
+ # This seems slightly (2%) faster.
+ # TODO: check whether it's faster for multiple images as well
+ image_size = image_sizes[0]
+ padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]]
+ batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0)
+ else:
+ # max_size can be a tensor in tracing mode, therefore convert to list
+ batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size)
+ batched_imgs = tensors[0].new_full(batch_shape, pad_value)
+ for img, pad_img in zip(tensors, batched_imgs):
+ pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img)
+
+ if intrinsics is not None:
+ assert isinstance(intrinsics, (tuple, list))
+ assert len(intrinsics) == len(tensors)
+ intrinsics = torch.stack(intrinsics, dim=0)
+
+ if image_paths is not None:
+ assert len(image_paths) == len(tensors)
+
+ return ImageList(batched_imgs.contiguous(), image_sizes, intrinsics, image_paths)
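+
+# Padding behaviour in brief (illustrative, not part of the original file):
+#   >>> imgs = [torch.rand(3, 480, 640), torch.rand(3, 360, 600)]
+#   >>> images = ImageList.from_tensors(imgs, size_divisibility=32)
+#   >>> images.tensor.shape   # torch.Size([2, 3, 480, 640]) -- padded, divisible by 32
+#   >>> images[1].shape       # torch.Size([3, 360, 600]) -- original, unpadded view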
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/pose.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/pose.py
new file mode 100644
index 0000000..2746f92
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/pose.py
@@ -0,0 +1,164 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import numpy as np
+from pyquaternion import Quaternion
+
+
+class Pose:
+ """SE(3) rigid transform class that allows compounding of 6-DOF poses
+    and provides transformations commonly used in geometric problems.
+ """
+ def __init__(self, wxyz=np.float32([1., 0., 0., 0.]), tvec=np.float32([0., 0., 0.])):
+ """Initialize a Pose with Quaternion and 3D Position
+
+ Parameters
+ ----------
+ wxyz: np.float32 or Quaternion (default: np.float32([1,0,0,0]))
+ Quaternion/Rotation (wxyz)
+
+ tvec: np.float32 (default: np.float32([0,0,0]))
+ Translation (xyz)
+ """
+ assert isinstance(wxyz, (np.ndarray, Quaternion))
+ assert isinstance(tvec, np.ndarray)
+
+ if isinstance(wxyz, np.ndarray):
+ assert np.abs(1.0 - np.linalg.norm(wxyz)) < 1.0e-3
+
+ self.quat = Quaternion(wxyz)
+ self.tvec = tvec
+
+ def __repr__(self):
+ formatter = {'float_kind': lambda x: '%.2f' % x}
+ tvec_str = np.array2string(self.tvec, formatter=formatter)
+ return 'wxyz: {}, tvec: ({})'.format(self.quat, tvec_str)
+
+ def copy(self):
+ """Return a copy of this pose object.
+
+ Returns
+ ----------
+ result: Pose
+ Copied pose object.
+ """
+ return self.__class__(Quaternion(self.quat), self.tvec.copy())
+
+ def __mul__(self, other):
+ """Left-multiply Pose with another Pose or 3D-Points.
+
+ Parameters
+ ----------
+ other: Pose or np.ndarray
+ 1. Pose: Identical to oplus operation.
+ (i.e. self_pose * other_pose)
+ 2. ndarray: transform [N x 3] point set
+ (i.e. X' = self_pose * X)
+
+ Returns
+ ----------
+ result: Pose or np.ndarray
+ Transformed pose or point cloud
+ """
+ if isinstance(other, Pose):
+ assert isinstance(other, self.__class__)
+ t = self.quat.rotate(other.tvec) + self.tvec
+ q = self.quat * other.quat
+ return self.__class__(q, t)
+ elif isinstance(other, np.ndarray):
+ assert other.shape[-1] == 3, 'Point cloud is not 3-dimensional'
+ X = np.hstack([other, np.ones((len(other), 1))]).T
+ return (np.dot(self.matrix, X).T)[:, :3]
+ else:
+ return NotImplemented
+
+ def __rmul__(self, other):
+ raise NotImplementedError('Right multiply not implemented yet!')
+
+ def inverse(self):
+ """Returns a new Pose that corresponds to the
+ inverse of this one.
+
+ Returns
+ ----------
+ result: Pose
+ Inverted pose
+ """
+ qinv = self.quat.inverse
+ return self.__class__(qinv, qinv.rotate(-self.tvec))
+
+ @property
+ def matrix(self):
+ """Returns a 4x4 homogeneous matrix of the form [R t; 0 1]
+
+ Returns
+ ----------
+ result: np.ndarray
+ 4x4 homogeneous matrix
+ """
+ result = self.quat.transformation_matrix
+ result[:3, 3] = self.tvec
+ return result
+
+ @property
+ def rotation_matrix(self):
+ """Returns the 3x3 rotation matrix (R)
+
+ Returns
+ ----------
+ result: np.ndarray
+ 3x3 rotation matrix
+ """
+ result = self.quat.transformation_matrix
+ return result[:3, :3]
+
+ @property
+ def rotation(self):
+ """Return the rotation component of the pose as a Quaternion object.
+
+ Returns
+ ----------
+ self.quat: Quaternion
+ Rotation component of the Pose object.
+ """
+ return self.quat
+
+ @property
+ def translation(self):
+ """Return the translation component of the pose as a np.ndarray.
+
+ Returns
+ ----------
+ self.tvec: np.ndarray
+ Translation component of the Pose object.
+ """
+ return self.tvec
+
+ @classmethod
+ def from_matrix(cls, transformation_matrix):
+ """Initialize pose from 4x4 transformation matrix
+
+ Parameters
+ ----------
+ transformation_matrix: np.ndarray
+ 4x4 containing rotation/translation
+
+ Returns
+ -------
+ Pose
+ """
+ return cls(wxyz=Quaternion(matrix=transformation_matrix[:3, :3]), tvec=np.float32(transformation_matrix[:3, 3]))
+
+ @classmethod
+ def from_rotation_translation(cls, rotation_matrix, tvec):
+ """Initialize pose from rotation matrix and translation vector.
+
+ Parameters
+ ----------
+ rotation_matrix : np.ndarray
+ 3x3 rotation matrix
+ tvec : np.ndarray
+ length-3 translation vector
+ """
+ return cls(wxyz=Quaternion(matrix=rotation_matrix), tvec=np.float64(tvec))
+
+ def __eq__(self, other):
+ return self.quat == other.quat and (self.tvec == other.tvec).all()
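+
+# Small usage sketch (illustrative): composing a pose with its inverse gives the identity.
+#   >>> p = Pose(wxyz=np.float32([1., 0., 0., 0.]), tvec=np.float32([0., 0., 1.]))
+#   >>> p * np.float32([[0., 0., 0.], [1., 0., 0.]])   # points shifted by +1 along z
+#   >>> np.allclose((p.inverse() * p).matrix, np.eye(4))
+#   True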
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/transform3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/transform3d.py
new file mode 100644
index 0000000..36133d0
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/transform3d.py
@@ -0,0 +1,896 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import warnings
+from typing import List, Optional, Union
+
+import torch
+
+Device = Union[str, torch.device]
+
+
+def make_device(device: Device) -> torch.device:
+ """
+ Makes an actual torch.device object from the device specified as
+ either a string or torch.device object. If the device is `cuda` without
+ a specific index, the index of the current device is assigned.
+
+ Args:
+ device: Device (as str or torch.device)
+
+ Returns:
+ A matching torch.device object
+ """
+ device = torch.device(device) if isinstance(device, str) else device
+ if device.type == "cuda" and device.index is None: # pyre-ignore[16]
+ # If cuda but with no index, then the current cuda device is indicated.
+ # In that case, we fix to that device
+ device = torch.device(f"cuda:{torch.cuda.current_device()}")
+ return device
+
+
+def get_device(x, device: Optional[Device] = None) -> torch.device:
+ """
+ Gets the device of the specified variable x if it is a tensor, or
+ falls back to a default CPU device otherwise. Allows overriding by
+ providing an explicit device.
+
+ Args:
+ x: a torch.Tensor to get the device from or another type
+ device: Device (as str or torch.device) to fall back to
+
+ Returns:
+ A matching torch.device object
+ """
+
+ # User overrides device
+ if device is not None:
+ return make_device(device)
+
+ # Set device based on input tensor
+ if torch.is_tensor(x):
+ return x.device
+
+ # Default device is cpu
+ return torch.device("cpu")
+
+
+def _safe_det_3x3(t: torch.Tensor):
+ """
+ Fast determinant calculation for a batch of 3x3 matrices.
+
+ Note, result of this function might not be the same as `torch.det()`.
+ The differences might be in the last significant digit.
+
+ Args:
+ t: Tensor of shape (N, 3, 3).
+
+ Returns:
+ Tensor of shape (N) with determinants.
+ """
+
+ det = (
+ t[..., 0, 0] * (t[..., 1, 1] * t[..., 2, 2] - t[..., 1, 2] * t[..., 2, 1])
+ - t[..., 0, 1] * (t[..., 1, 0] * t[..., 2, 2] - t[..., 2, 0] * t[..., 1, 2])
+ + t[..., 0, 2] * (t[..., 1, 0] * t[..., 2, 1] - t[..., 2, 0] * t[..., 1, 1])
+ )
+
+ return det
+
+def _axis_angle_rotation(axis: str, angle: torch.Tensor) -> torch.Tensor:
+ """
+    Return the rotation matrices for rotations about a single axis
+    (as used when composing Euler-angle rotations), one per angle value.
+
+    Args:
+        axis: Axis label "X", "Y", or "Z".
+        angle: Euler angles in radians, as a tensor of any shape.
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+
+ cos = torch.cos(angle)
+ sin = torch.sin(angle)
+ one = torch.ones_like(angle)
+ zero = torch.zeros_like(angle)
+
+ if axis == "X":
+ R_flat = (one, zero, zero, zero, cos, -sin, zero, sin, cos)
+ elif axis == "Y":
+ R_flat = (cos, zero, sin, zero, one, zero, -sin, zero, cos)
+ elif axis == "Z":
+ R_flat = (cos, -sin, zero, sin, cos, zero, zero, zero, one)
+ else:
+ raise ValueError("letter must be either X, Y or Z.")
+
+ return torch.stack(R_flat, -1).reshape(angle.shape + (3, 3))
+
+class Transform3d:
+ """
+ A Transform3d object encapsulates a batch of N 3D transformations, and knows
+ how to transform points and normal vectors. Suppose that t is a Transform3d;
+ then we can do the following:
+
+ .. code-block:: python
+
+ N = len(t)
+ points = torch.randn(N, P, 3)
+ normals = torch.randn(N, P, 3)
+ points_transformed = t.transform_points(points) # => (N, P, 3)
+ normals_transformed = t.transform_normals(normals) # => (N, P, 3)
+
+
+ BROADCASTING
+    Transform3d objects support broadcasting. Suppose that t1 and tN are
+ Transform3d objects with len(t1) == 1 and len(tN) == N respectively. Then we
+ can broadcast transforms like this:
+
+ .. code-block:: python
+
+ t1.transform_points(torch.randn(P, 3)) # => (P, 3)
+ t1.transform_points(torch.randn(1, P, 3)) # => (1, P, 3)
+ t1.transform_points(torch.randn(M, P, 3)) # => (M, P, 3)
+ tN.transform_points(torch.randn(P, 3)) # => (N, P, 3)
+ tN.transform_points(torch.randn(1, P, 3)) # => (N, P, 3)
+
+
+ COMBINING TRANSFORMS
+ Transform3d objects can be combined in two ways: composing and stacking.
+ Composing is function composition. Given Transform3d objects t1, t2, t3,
+ the following all compute the same thing:
+
+ .. code-block:: python
+
+ y1 = t3.transform_points(t2.transform_points(t1.transform_points(x)))
+ y2 = t1.compose(t2).compose(t3).transform_points(x)
+ y3 = t1.compose(t2, t3).transform_points(x)
+
+
+ Composing transforms should broadcast.
+
+ .. code-block:: python
+
+ if len(t1) == 1 and len(t2) == N, then len(t1.compose(t2)) == N.
+
+ We can also stack a sequence of Transform3d objects, which represents
+ composition along the batch dimension; then the following should compute the
+ same thing.
+
+ .. code-block:: python
+
+ N, M = len(tN), len(tM)
+ xN = torch.randn(N, P, 3)
+ xM = torch.randn(M, P, 3)
+ y1 = torch.cat([tN.transform_points(xN), tM.transform_points(xM)], dim=0)
+ y2 = tN.stack(tM).transform_points(torch.cat([xN, xM], dim=0))
+
+ BUILDING TRANSFORMS
+ We provide convenience methods for easily building Transform3d objects
+ as compositions of basic transforms.
+
+ .. code-block:: python
+
+ # Scale by 0.5, then translate by (1, 2, 3)
+ t1 = Transform3d().scale(0.5).translate(1, 2, 3)
+
+ # Scale each axis by a different amount, then translate, then scale
+ t2 = Transform3d().scale(1, 3, 3).translate(2, 3, 1).scale(2.0)
+
+ t3 = t1.compose(t2)
+ tN = t1.stack(t3, t3)
+
+
+ BACKPROP THROUGH TRANSFORMS
+ When building transforms, we can also parameterize them by Torch tensors;
+ in this case we can backprop through the construction and application of
+ Transform objects, so they could be learned via gradient descent or
+ predicted by a neural network.
+
+ .. code-block:: python
+
+ s1_params = torch.randn(N, requires_grad=True)
+ t_params = torch.randn(N, 3, requires_grad=True)
+ s2_params = torch.randn(N, 3, requires_grad=True)
+
+ t = Transform3d().scale(s1_params).translate(t_params).scale(s2_params)
+ x = torch.randn(N, 3)
+ y = t.transform_points(x)
+ loss = compute_loss(y)
+ loss.backward()
+
+ with torch.no_grad():
+ s1_params -= lr * s1_params.grad
+ t_params -= lr * t_params.grad
+ s2_params -= lr * s2_params.grad
+
+ CONVENTIONS
+ We adopt a right-hand coordinate system, meaning that rotation about an axis
+ with a positive angle results in a counter clockwise rotation.
+
+ This class assumes that transformations are applied on inputs which
+ are row vectors. The internal representation of the Nx4x4 transformation
+ matrix is of the form:
+
+ .. code-block:: python
+
+ M = [
+ [Rxx, Ryx, Rzx, 0],
+ [Rxy, Ryy, Rzy, 0],
+ [Rxz, Ryz, Rzz, 0],
+ [Tx, Ty, Tz, 1],
+ ]
+
+    To apply the transformation to points, which are row vectors, the M matrix
+    can be pre-multiplied by the points:
+
+ .. code-block:: python
+
+ points = [[0, 1, 2]] # (1 x 3) xyz coordinates of a point
+ transformed_points = points * M
+
+ """
+
+ def __init__(
+ self,
+ dtype: torch.dtype = torch.float32,
+ device: Device = "cpu",
+ matrix: Optional[torch.Tensor] = None,
+ ) -> None:
+ """
+ Args:
+            dtype: The data type of the transformation matrix,
+                to be used if `matrix = None`.
+ device: The device for storing the implemented transformation.
+ If `matrix != None`, uses the device of input `matrix`.
+ matrix: A tensor of shape (4, 4) or of shape (minibatch, 4, 4)
+ representing the 4x4 3D transformation matrix.
+ If `None`, initializes with identity using
+ the specified `device` and `dtype`.
+ """
+
+ if matrix is None:
+ self._matrix = torch.eye(4, dtype=dtype, device=device).view(1, 4, 4)
+ else:
+ if matrix.ndim not in (2, 3):
+ raise ValueError('"matrix" has to be a 2- or a 3-dimensional tensor.')
+ if matrix.shape[-2] != 4 or matrix.shape[-1] != 4:
+ raise ValueError(
+ '"matrix" has to be a tensor of shape (minibatch, 4, 4)'
+ )
+ # set dtype and device from matrix
+ dtype = matrix.dtype
+ device = matrix.device
+ self._matrix = matrix.view(-1, 4, 4)
+
+ self._transforms = [] # store transforms to compose
+ self._lu = None
+ self.device = make_device(device)
+ self.dtype = dtype
+
+ def __len__(self) -> int:
+ return self.get_matrix().shape[0]
+
+ def __getitem__(
+ self, index: Union[int, List[int], slice, torch.Tensor]
+ ) -> "Transform3d":
+ """
+ Args:
+ index: Specifying the index of the transform to retrieve.
+ Can be an int, slice, list of ints, boolean, long tensor.
+ Supports negative indices.
+
+ Returns:
+ Transform3d object with selected transforms. The tensors are not cloned.
+ """
+ if isinstance(index, int):
+ index = [index]
+ return self.__class__(matrix=self.get_matrix()[index])
+
+ def compose(self, *others: "Transform3d") -> "Transform3d":
+ """
+ Return a new Transform3d representing the composition of self with the
+ given other transforms, which will be stored as an internal list.
+
+ Args:
+ *others: Any number of Transform3d objects
+
+ Returns:
+ A new Transform3d with the stored transforms
+ """
+ out = Transform3d(dtype=self.dtype, device=self.device)
+ out._matrix = self._matrix.clone()
+ for other in others:
+ if not isinstance(other, Transform3d):
+ msg = "Only possible to compose Transform3d objects; got %s"
+ raise ValueError(msg % type(other))
+ out._transforms = self._transforms + list(others)
+ return out
+
+ def get_matrix(self) -> torch.Tensor:
+ """
+ Return a matrix which is the result of composing this transform
+ with others stored in self.transforms. Where necessary transforms
+ are broadcast against each other.
+ For example, if self.transforms contains transforms t1, t2, and t3, and
+ given a set of points x, the following should be true:
+
+ .. code-block:: python
+
+ y1 = t1.compose(t2, t3).transform(x)
+ y2 = t3.transform(t2.transform(t1.transform(x)))
+ y1.get_matrix() == y2.get_matrix()
+
+ Returns:
+ A transformation matrix representing the composed inputs.
+ """
+ composed_matrix = self._matrix.clone()
+ if len(self._transforms) > 0:
+ for other in self._transforms:
+ other_matrix = other.get_matrix()
+ composed_matrix = _broadcast_bmm(composed_matrix, other_matrix)
+ return composed_matrix
+
+ def _get_matrix_inverse(self) -> torch.Tensor:
+ """
+ Return the inverse of self._matrix.
+ """
+ return torch.inverse(self._matrix)
+
+ def inverse(self, invert_composed: bool = False) -> "Transform3d":
+ """
+ Returns a new Transform3d object that represents an inverse of the
+ current transformation.
+
+ Args:
+ invert_composed:
+ - True: First compose the list of stored transformations
+ and then apply inverse to the result. This is
+ potentially slower for classes of transformations
+ with inverses that can be computed efficiently
+ (e.g. rotations and translations).
+ - False: Invert the individual stored transformations
+ independently without composing them.
+
+ Returns:
+ A new Transform3d object containing the inverse of the original
+ transformation.
+ """
+
+ tinv = Transform3d(dtype=self.dtype, device=self.device)
+
+ if invert_composed:
+ # first compose then invert
+ tinv._matrix = torch.inverse(self.get_matrix())
+ else:
+ # self._get_matrix_inverse() implements efficient inverse
+ # of self._matrix
+ i_matrix = self._get_matrix_inverse()
+
+ # 2 cases:
+ if len(self._transforms) > 0:
+ # a) Either we have a non-empty list of transforms:
+ # Here we take self._matrix and append its inverse at the
+ # end of the reverted _transforms list. After composing
+ # the transformations with get_matrix(), this correctly
+ # right-multiplies by the inverse of self._matrix
+ # at the end of the composition.
+ tinv._transforms = [t.inverse() for t in reversed(self._transforms)]
+ last = Transform3d(dtype=self.dtype, device=self.device)
+ last._matrix = i_matrix
+ tinv._transforms.append(last)
+ else:
+ # b) Or there are no stored transformations
+ # we just set inverted matrix
+ tinv._matrix = i_matrix
+
+ return tinv
+
+ def stack(self, *others: "Transform3d") -> "Transform3d":
+ """
+ Return a new batched Transform3d representing the batch elements from
+ self and all the given other transforms all batched together.
+
+ Args:
+ *others: Any number of Transform3d objects
+
+ Returns:
+ A new Transform3d.
+ """
+ transforms = [self] + list(others)
+ matrix = torch.cat([t.get_matrix() for t in transforms], dim=0)
+ out = Transform3d(dtype=self.dtype, device=self.device)
+ out._matrix = matrix
+ return out
+
+ def transform_points(self, points, eps: Optional[float] = None) -> torch.Tensor:
+ """
+ Use this transform to transform a set of 3D points. Assumes row major
+ ordering of the input points.
+
+ Args:
+ points: Tensor of shape (P, 3) or (N, P, 3)
+ eps: If eps!=None, the argument is used to clamp the
+ last coordinate before performing the final division.
+ The clamping corresponds to:
+ last_coord := (last_coord.sign() + (last_coord==0)) *
+ torch.clamp(last_coord.abs(), eps),
+ i.e. the last coordinates that are exactly 0 will
+ be clamped to +eps.
+
+ Returns:
+ points_out: points of shape (N, P, 3) or (P, 3) depending
+ on the dimensions of the transform
+ """
+ points_batch = points.clone()
+ if points_batch.dim() == 2:
+ points_batch = points_batch[None] # (P, 3) -> (1, P, 3)
+ if points_batch.dim() != 3:
+ msg = "Expected points to have dim = 2 or dim = 3: got shape %r"
+ raise ValueError(msg % repr(points.shape))
+
+ N, P, _3 = points_batch.shape
+ ones = torch.ones(N, P, 1, dtype=points.dtype, device=points.device)
+ points_batch = torch.cat([points_batch, ones], dim=2)
+
+ composed_matrix = self.get_matrix()
+ points_out = _broadcast_bmm(points_batch, composed_matrix)
+ denom = points_out[..., 3:] # denominator
+ if eps is not None:
+ denom_sign = denom.sign() + (denom == 0.0).type_as(denom)
+ denom = denom_sign * torch.clamp(denom.abs(), eps)
+ points_out = points_out[..., :3] / denom
+
+ # When transform is (1, 4, 4) and points is (P, 3) return
+ # points_out of shape (P, 3)
+ if points_out.shape[0] == 1 and points.dim() == 2:
+ points_out = points_out.reshape(points.shape)
+
+ return points_out
+
+ def transform_normals(self, normals) -> torch.Tensor:
+ """
+ Use this transform to transform a set of normal vectors.
+
+ Args:
+ normals: Tensor of shape (P, 3) or (N, P, 3)
+
+ Returns:
+ normals_out: Tensor of shape (P, 3) or (N, P, 3) depending
+ on the dimensions of the transform
+ """
+ if normals.dim() not in [2, 3]:
+ msg = "Expected normals to have dim = 2 or dim = 3: got shape %r"
+ raise ValueError(msg % (normals.shape,))
+ composed_matrix = self.get_matrix()
+
+ # TODO: inverse is bad! Solve a linear system instead
+ mat = composed_matrix[:, :3, :3]
+ normals_out = _broadcast_bmm(normals, mat.transpose(1, 2).inverse())
+
+ # This doesn't pass unit tests. TODO investigate further
+ # if self._lu is None:
+ # self._lu = self._matrix[:, :3, :3].transpose(1, 2).lu()
+ # normals_out = normals.lu_solve(*self._lu)
+
+ # When transform is (1, 4, 4) and normals is (P, 3) return
+ # normals_out of shape (P, 3)
+ if normals_out.shape[0] == 1 and normals.dim() == 2:
+ normals_out = normals_out.reshape(normals.shape)
+
+ return normals_out
+
+ def translate(self, *args, **kwargs) -> "Transform3d":
+ return self.compose(
+ Translate(device=self.device, dtype=self.dtype, *args, **kwargs)
+ )
+
+ def scale(self, *args, **kwargs) -> "Transform3d":
+ return self.compose(
+ Scale(device=self.device, dtype=self.dtype, *args, **kwargs)
+ )
+
+ def rotate(self, *args, **kwargs) -> "Transform3d":
+ return self.compose(
+ Rotate(device=self.device, dtype=self.dtype, *args, **kwargs)
+ )
+
+ def rotate_axis_angle(self, *args, **kwargs) -> "Transform3d":
+ return self.compose(
+ RotateAxisAngle(device=self.device, dtype=self.dtype, *args, **kwargs)
+ )
+
+ def clone(self) -> "Transform3d":
+ """
+ Deep copy of Transforms object. All internal tensors are cloned
+ individually.
+
+ Returns:
+ new Transforms object.
+ """
+ other = Transform3d(dtype=self.dtype, device=self.device)
+ if self._lu is not None:
+ other._lu = [elem.clone() for elem in self._lu]
+ other._matrix = self._matrix.clone()
+ other._transforms = [t.clone() for t in self._transforms]
+ return other
+
+ def to(
+ self,
+ device: Device,
+ copy: bool = False,
+ dtype: Optional[torch.dtype] = None,
+ ) -> "Transform3d":
+ """
+ Match functionality of torch.Tensor.to()
+ If copy = True or the self Tensor is on a different device, the
+ returned tensor is a copy of self with the desired torch.device.
+ If copy = False and the self Tensor already has the correct torch.device,
+ then self is returned.
+
+ Args:
+ device: Device (as str or torch.device) for the new tensor.
+ copy: Boolean indicator whether or not to clone self. Default False.
+ dtype: If not None, casts the internal tensor variables
+ to a given torch.dtype.
+
+ Returns:
+ Transform3d object.
+ """
+ device_ = make_device(device)
+ dtype_ = self.dtype if dtype is None else dtype
+ skip_to = self.device == device_ and self.dtype == dtype_
+
+ if not copy and skip_to:
+ return self
+
+ other = self.clone()
+
+ if skip_to:
+ return other
+
+ other.device = device_
+ other.dtype = dtype_
+ other._matrix = other._matrix.to(device=device_, dtype=dtype_)
+ other._transforms = [
+ t.to(device_, copy=copy, dtype=dtype_) for t in other._transforms
+ ]
+ return other
+
+ def cpu(self) -> "Transform3d":
+ return self.to("cpu")
+
+ def cuda(self) -> "Transform3d":
+ return self.to("cuda")
+
+
+class Translate(Transform3d):
+ def __init__(
+ self,
+ x,
+ y=None,
+ z=None,
+ dtype: torch.dtype = torch.float32,
+ device: Optional[Device] = None,
+ ) -> None:
+ """
+ Create a new Transform3d representing 3D translations.
+
+ Option I: Translate(xyz, dtype=torch.float32, device='cpu')
+ xyz should be a tensor of shape (N, 3)
+
+ Option II: Translate(x, y, z, dtype=torch.float32, device='cpu')
+ Here x, y, and z will be broadcast against each other and
+ concatenated to form the translation. Each can be:
+ - A python scalar
+ - A torch scalar
+ - A 1D torch tensor
+ """
+ xyz = _handle_input(x, y, z, dtype, device, "Translate")
+ super().__init__(device=xyz.device, dtype=dtype)
+ N = xyz.shape[0]
+
+ mat = torch.eye(4, dtype=dtype, device=self.device)
+ mat = mat.view(1, 4, 4).repeat(N, 1, 1)
+ mat[:, 3, :3] = xyz
+ self._matrix = mat
+
+ def _get_matrix_inverse(self) -> torch.Tensor:
+ """
+ Return the inverse of self._matrix.
+ """
+ inv_mask = self._matrix.new_ones([1, 4, 4])
+ inv_mask[0, 3, :3] = -1.0
+ i_matrix = self._matrix * inv_mask
+ return i_matrix
+
+
+class Scale(Transform3d):
+ def __init__(
+ self,
+ x,
+ y=None,
+ z=None,
+ dtype: torch.dtype = torch.float32,
+ device: Optional[Device] = None,
+ ) -> None:
+ """
+ A Transform3d representing a scaling operation, with different scale
+ factors along each coordinate axis.
+
+ Option I: Scale(s, dtype=torch.float32, device='cpu')
+ s can be one of
+ - Python scalar or torch scalar: Single uniform scale
+ - 1D torch tensor of shape (N,): A batch of uniform scale
+ - 2D torch tensor of shape (N, 3): Scale differently along each axis
+
+ Option II: Scale(x, y, z, dtype=torch.float32, device='cpu')
+ Each of x, y, and z can be one of
+ - python scalar
+ - torch scalar
+ - 1D torch tensor
+ """
+ xyz = _handle_input(x, y, z, dtype, device, "scale", allow_singleton=True)
+ super().__init__(device=xyz.device, dtype=dtype)
+ N = xyz.shape[0]
+
+ # TODO: Can we do this all in one go somehow?
+ mat = torch.eye(4, dtype=dtype, device=self.device)
+ mat = mat.view(1, 4, 4).repeat(N, 1, 1)
+ mat[:, 0, 0] = xyz[:, 0]
+ mat[:, 1, 1] = xyz[:, 1]
+ mat[:, 2, 2] = xyz[:, 2]
+ self._matrix = mat
+
+ def _get_matrix_inverse(self) -> torch.Tensor:
+ """
+ Return the inverse of self._matrix.
+ """
+ xyz = torch.stack([self._matrix[:, i, i] for i in range(4)], dim=1)
+ ixyz = 1.0 / xyz
+ imat = torch.diag_embed(ixyz, dim1=1, dim2=2)
+ return imat
+
+
+class Rotate(Transform3d):
+ def __init__(
+ self,
+ R: torch.Tensor,
+ dtype: torch.dtype = torch.float32,
+ device: Optional[Device] = None,
+ orthogonal_tol: float = 1e-5,
+ ) -> None:
+ """
+ Create a new Transform3d representing 3D rotation using a rotation
+ matrix as the input.
+
+ Args:
+ R: a tensor of shape (3, 3) or (N, 3, 3)
+ orthogonal_tol: tolerance for the test of the orthogonality of R
+
+ """
+ device_ = get_device(R, device)
+ super().__init__(device=device_, dtype=dtype)
+ if R.dim() == 2:
+ R = R[None]
+ if R.shape[-2:] != (3, 3):
+ msg = "R must have shape (3, 3) or (N, 3, 3); got %s"
+ raise ValueError(msg % repr(R.shape))
+ R = R.to(device=device_, dtype=dtype)
+ _check_valid_rotation_matrix(R, tol=orthogonal_tol)
+ N = R.shape[0]
+ mat = torch.eye(4, dtype=dtype, device=device_)
+ mat = mat.view(1, 4, 4).repeat(N, 1, 1)
+ mat[:, :3, :3] = R
+ self._matrix = mat
+
+ def _get_matrix_inverse(self) -> torch.Tensor:
+ """
+ Return the inverse of self._matrix.
+ """
+ return self._matrix.permute(0, 2, 1).contiguous()
+
+
+class RotateAxisAngle(Rotate):
+ def __init__(
+ self,
+ angle,
+ axis: str = "X",
+ degrees: bool = True,
+ dtype: torch.dtype = torch.float32,
+ device: Optional[Device] = None,
+ ) -> None:
+ """
+ Create a new Transform3d representing 3D rotation about an axis
+ by an angle.
+
+ Assuming a right-hand coordinate system, positive rotation angles result
+ in a counter clockwise rotation.
+
+ Args:
+ angle:
+ - A torch tensor of shape (N,)
+ - A python scalar
+ - A torch scalar
+ axis:
+ string: one of ["X", "Y", "Z"] indicating the axis about which
+ to rotate.
+ NOTE: All batch elements are rotated about the same axis.
+ """
+ axis = axis.upper()
+ if axis not in ["X", "Y", "Z"]:
+ msg = "Expected axis to be one of ['X', 'Y', 'Z']; got %s"
+ raise ValueError(msg % axis)
+ angle = _handle_angle_input(angle, dtype, device, "RotateAxisAngle")
+ angle = (angle / 180.0 * math.pi) if degrees else angle
+ # We assume the points on which this transformation will be applied
+ # are row vectors. The rotation matrix returned from _axis_angle_rotation
+ # is for transforming column vectors. Therefore we transpose this matrix.
+ # R will always be of shape (N, 3, 3)
+ R = _axis_angle_rotation(axis, angle).transpose(1, 2)
+ super().__init__(device=angle.device, R=R, dtype=dtype)
+
+
+def _handle_coord(c, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+ """
+ Helper function for _handle_input.
+
+ Args:
+ c: Python scalar, torch scalar, or 1D torch tensor
+
+ Returns:
+ c_vec: 1D torch tensor
+ """
+ if not torch.is_tensor(c):
+ c = torch.tensor(c, dtype=dtype, device=device)
+ if c.dim() == 0:
+ c = c.view(1)
+ if c.device != device or c.dtype != dtype:
+ c = c.to(device=device, dtype=dtype)
+ return c
+
+
+def _handle_input(
+ x,
+ y,
+ z,
+ dtype: torch.dtype,
+ device: Optional[Device],
+ name: str,
+ allow_singleton: bool = False,
+) -> torch.Tensor:
+ """
+ Helper function to handle parsing logic for building transforms. The output
+ is always a tensor of shape (N, 3), but there are several types of allowed
+ input.
+
+ Case I: Single Matrix
+ In this case x is a tensor of shape (N, 3), and y and z are None. Here just
+ return x.
+
+ Case II: Vectors and Scalars
+ In this case each of x, y, and z can be one of the following
+ - Python scalar
+ - Torch scalar
+ - Torch tensor of shape (N, 1) or (1, 1)
+ In this case x, y and z are broadcast to tensors of shape (N, 1)
+ and concatenated to a tensor of shape (N, 3)
+
+ Case III: Singleton (only if allow_singleton=True)
+ In this case y and z are None, and x can be one of the following:
+ - Python scalar
+ - Torch scalar
+ - Torch tensor of shape (N, 1) or (1, 1)
+ Here x will be duplicated 3 times, and we return a tensor of shape (N, 3)
+
+ Returns:
+ xyz: Tensor of shape (N, 3)
+ """
+ device_ = get_device(x, device)
+ # If x is actually a tensor of shape (N, 3) then just return it
+ if torch.is_tensor(x) and x.dim() == 2:
+ if x.shape[1] != 3:
+ msg = "Expected tensor of shape (N, 3); got %r (in %s)"
+ raise ValueError(msg % (x.shape, name))
+ if y is not None or z is not None:
+ msg = "Expected y and z to be None (in %s)" % name
+ raise ValueError(msg)
+ return x.to(device=device_, dtype=dtype)
+
+ if allow_singleton and y is None and z is None:
+ y = x
+ z = x
+
+ # Convert all to 1D tensors
+ xyz = [_handle_coord(c, dtype, device_) for c in [x, y, z]]
+
+ # Broadcast and concatenate
+ sizes = [c.shape[0] for c in xyz]
+ N = max(sizes)
+ for c in xyz:
+ if c.shape[0] != 1 and c.shape[0] != N:
+ msg = "Got non-broadcastable sizes %r (in %s)" % (sizes, name)
+ raise ValueError(msg)
+ xyz = [c.expand(N) for c in xyz]
+ xyz = torch.stack(xyz, dim=1)
+ return xyz
+
+
+def _handle_angle_input(
+ x, dtype: torch.dtype, device: Optional[Device], name: str
+) -> torch.Tensor:
+ """
+ Helper function for building a rotation function using angles.
+ The output is always of shape (N,).
+
+ The input can be one of:
+ - Torch tensor of shape (N,)
+ - Python scalar
+ - Torch scalar
+ """
+ device_ = get_device(x, device)
+ if torch.is_tensor(x) and x.dim() > 1:
+ msg = "Expected tensor of shape (N,); got %r (in %s)"
+ raise ValueError(msg % (x.shape, name))
+ else:
+ return _handle_coord(x, dtype, device_)
+
+
+def _broadcast_bmm(a, b) -> torch.Tensor:
+ """
+ Batch multiply two matrices and broadcast if necessary.
+
+ Args:
+ a: torch tensor of shape (P, K) or (M, P, K)
+ b: torch tensor of shape (N, K, K)
+
+ Returns:
+ a and b broadcast multiplied. The output batch dimension is max(N, M).
+
+ To broadcast transforms across a batch dimension if M != N then
+ expect that either M = 1 or N = 1. The tensor with batch dimension 1 is
+ expanded to have shape N or M.
+ """
+ if a.dim() == 2:
+ a = a[None]
+ if len(a) != len(b):
+ if not ((len(a) == 1) or (len(b) == 1)):
+ msg = "Expected batch dim for bmm to be equal or 1; got %r, %r"
+ raise ValueError(msg % (a.shape, b.shape))
+ if len(a) == 1:
+ a = a.expand(len(b), -1, -1)
+ if len(b) == 1:
+ b = b.expand(len(a), -1, -1)
+ return a.bmm(b)
+
+
+@torch.no_grad()
+def _check_valid_rotation_matrix(R, tol: float = 1e-7) -> None:
+ """
+ Determine if R is a valid rotation matrix by checking it satisfies the
+ following conditions:
+
+ ``RR^T = I and det(R) = 1``
+
+ Args:
+ R: an (N, 3, 3) matrix
+
+ Returns:
+ None
+
+ Emits a warning if R is an invalid rotation matrix.
+ """
+ N = R.shape[0]
+ eye = torch.eye(3, dtype=R.dtype, device=R.device)
+ eye = eye.view(1, 3, 3).expand(N, -1, -1)
+ orthogonal = torch.allclose(R.bmm(R.transpose(1, 2)), eye, atol=tol)
+ det_R = _safe_det_3x3(R)
+ no_distortion = torch.allclose(det_R, torch.ones_like(det_R))
+ if not (orthogonal and no_distortion):
+ msg = "R is not a valid rotation matrix"
+ warnings.warn(msg)
+ return
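+
+# Row-vector convention sanity check (illustrative, not part of the original file):
+# a 90-degree rotation about Z maps x to y, then the translation is applied.
+#   >>> t = Transform3d().rotate_axis_angle(90.0, axis="Z").translate(1.0, 0.0, 0.0)
+#   >>> t.transform_points(torch.tensor([[[1.0, 0.0, 0.0]]]))
+#   tensor([[[1., 1., 0.]]])  # up to float rounding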
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/comm.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/comm.py
new file mode 100644
index 0000000..77f3bdb
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/comm.py
@@ -0,0 +1,105 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import logging
+from functools import wraps
+
+import torch.distributed as dist
+
+LOG = logging.getLogger(__name__)
+
+_NESTED_BROADCAST_FROM_MASTER = False
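+# NOTE: `d2_comm` used below is not imported in this file; in the upstream DD3D code it
+# refers to detectron2-style communication helpers (is_main_process, gather, synchronize).
+# Where those helpers come from in this repository is left as an assumption here.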
+
+
+def get_world_size() -> int:
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size()
+
+def is_distributed():
+ return get_world_size() > 1
+
+
+def broadcast_from_master(fn):
+    """If distributed, only the master executes the function and broadcasts the results to other workers.
+
+ Usage:
+ @broadcast_from_master
+ def foo(a, b): ...
+ """
+ @wraps(fn)
+ def wrapper(*args, **kwargs): # pylint: disable=unused-argument
+ global _NESTED_BROADCAST_FROM_MASTER
+
+ if not is_distributed():
+ return fn(*args, **kwargs)
+
+ if _NESTED_BROADCAST_FROM_MASTER:
+ assert d2_comm.is_main_process()
+ LOG.warning(f"_NESTED_BROADCAST_FROM_MASTER = True, {fn.__name__}")
+ return fn(*args, **kwargs)
+
+ if d2_comm.is_main_process():
+ _NESTED_BROADCAST_FROM_MASTER = True
+ ret = [fn(*args, **kwargs), ]
+ _NESTED_BROADCAST_FROM_MASTER = False
+ else:
+ ret = [None, ]
+ if dist.is_initialized():
+ dist.broadcast_object_list(ret)
+ ret = ret[0]
+
+ assert ret is not None
+ return ret
+
+ return wrapper
+
+
+def master_only(fn):
+ """If distributed, only the master executes the function.
+
+ Usage:
+ @master_only
+ def foo(a, b): ...
+ """
+ @wraps(fn)
+ def wrapped_fn(*args, **kwargs):
+ if d2_comm.is_main_process():
+ ret = fn(*args, **kwargs)
+ d2_comm.synchronize()
+ if d2_comm.is_main_process():
+ return ret
+
+ return wrapped_fn
+
+
+def gather_dict(dikt):
+ """Gather python dictionaries from all workers to the rank=0 worker.
+
+ Assumption: the keys of `dikt` are disjoint across all workers.
+
+    If rank = 0, return the aggregated dict.
+    If rank > 0, return `None`.
+ """
+ dict_lst = d2_comm.gather(dikt, dst=0)
+ if d2_comm.is_main_process():
+ gathered_dict = {}
+ for dic in dict_lst:
+ for k in dic.keys():
+ assert k not in gathered_dict, f"Dictionary key overlaps: {k}"
+ gathered_dict.update(dic)
+ return gathered_dict
+ else:
+ return None
+
+
+def reduce_sum(tensor):
+ """
+ Adapted from AdelaiDet:
+ https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py
+ """
+ if not is_distributed():
+ return tensor
+ tensor = tensor.clone()
+ dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+ return tensor
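+
+# Typical use for distributed loss normalisation (a sketch; assumes the process group is
+# initialised and that `pos_inds` / `device` are defined by the caller):
+#   >>> num_pos = torch.tensor([float(len(pos_inds))], device=device)
+#   >>> num_pos_avg = reduce_sum(num_pos).item() / get_world_size()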
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/geometry.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/geometry.py
new file mode 100644
index 0000000..d8f546b
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/geometry.py
@@ -0,0 +1,204 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import logging
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+LOG = logging.getLogger(__name__)
+
+PI = 3.14159265358979323846
+EPS = 1e-7
+
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+ """
+ Returns torch.sqrt(torch.max(0, x))
+ but with a zero subgradient where x is 0.
+ """
+ ret = torch.zeros_like(x)
+ positive_mask = x > 0
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
+ return ret
+
+def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as rotation matrices to quaternions.
+
+ Args:
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
+
+ Returns:
+ quaternions with real part first, as tensor of shape (..., 4).
+ """
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+
+ batch_dim = matrix.shape[:-2]
+ m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
+ matrix.reshape(batch_dim + (9,)), dim=-1
+ )
+
+ q_abs = _sqrt_positive_part(
+ torch.stack(
+ [
+ 1.0 + m00 + m11 + m22,
+ 1.0 + m00 - m11 - m22,
+ 1.0 - m00 + m11 - m22,
+ 1.0 - m00 - m11 + m22,
+ ],
+ dim=-1,
+ )
+ )
+
+ # we produce the desired quaternion multiplied by each of r, i, j, k
+ quat_by_rijk = torch.stack(
+ [
+ torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
+ torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
+ torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
+ torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
+ ],
+ dim=-2,
+ )
+
+ # We floor here at 0.1 but the exact level is not important; if q_abs is small,
+ # the candidate won't be picked.
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+ quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
+
+ # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
+ # forall i; we pick the best-conditioned one (with the largest denominator)
+
+ return quat_candidates[
+ F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, : # pyre-ignore[16]
+ ].reshape(batch_dim + (4,))
+
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+ """
+ Convert rotations given as quaternions to rotation matrices.
+
+ Args:
+ quaternions: quaternions with real part first,
+ as tensor of shape (..., 4).
+
+ Returns:
+ Rotation matrices as tensor of shape (..., 3, 3).
+ """
+ r, i, j, k = torch.unbind(quaternions, -1)
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
+
+ o = torch.stack(
+ (
+ 1 - two_s * (j * j + k * k),
+ two_s * (i * j - k * r),
+ two_s * (i * k + j * r),
+ two_s * (i * j + k * r),
+ 1 - two_s * (i * i + k * k),
+ two_s * (j * k - i * r),
+ two_s * (i * k - j * r),
+ two_s * (j * k + i * r),
+ 1 - two_s * (i * i + j * j),
+ ),
+ -1,
+ )
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
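+
+# Illustrative round-trip check (hedged sketch, not exercised by this module):
+# a matrix -> quaternion -> matrix conversion should reproduce the input up to
+# numerical precision and quaternion sign.
+#
+#   R = torch.eye(3).expand(2, 3, 3)                 # batch of identity rotations
+#   q = matrix_to_quaternion(R)                      # ~ [[1., 0., 0., 0.], ...]
+#   assert torch.allclose(quaternion_to_matrix(q), R, atol=1e-6)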
+
+def allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics):
+ """
+ Parameters
+ ----------
+ quat: Tensor
+ (N, 4). Batch of (allocentric) quaternions.
+
+ proj_ctr: Tensor
+        (N, 2). Projected centers, in xy pixel coordinates.
+
+    inv_intrinsics: Tensor
+        (N, 3, 3). Inverted camera intrinsics.
+ """
+ R_obj_to_local = quaternion_to_matrix(quat)
+
+    # ray == z-axis in local orientation
+ ray = unproject_points2d(proj_ctr, inv_intrinsics)
+ z = ray / ray.norm(dim=1, keepdim=True)
+
+    # Gram-Schmidt step: local_y = global_y - (global_y . local_z) * local_z
+ y = z.new_tensor([[0., 1., 0.]]) - z[:, 1:2] * z
+ y = y / y.norm(dim=1, keepdim=True)
+ x = torch.cross(y, z, dim=1)
+
+ # local -> global
+ R_local_to_global = torch.stack([x, y, z], dim=-1)
+
+ # obj -> global
+ R_obj_to_global = torch.bmm(R_local_to_global, R_obj_to_local)
+
+ egocentric_quat = matrix_to_quaternion(R_obj_to_global)
+
+ # Make sure it's unit norm.
+ quat_norm = egocentric_quat.norm(dim=1, keepdim=True)
+ if not torch.allclose(quat_norm, torch.as_tensor(1.), atol=1e-3):
+ LOG.warning(
+ f"Some of the input quaternions are not unit norm: min={quat_norm.min()}, max={quat_norm.max()}; therefore normalizing."
+ )
+ egocentric_quat = egocentric_quat / quat_norm.clamp(min=EPS)
+
+ return egocentric_quat
+
+
+def homogenize_points(xy):
+ """
+ Parameters
+ ----------
+ xy: Tensor
+ xy coordinates. shape=(N, ..., 2)
+ E.g., (N, 2) or (N, K, 2) or (N, H, W, 2)
+
+ Returns
+ -------
+ Tensor:
+ 1. is appended to the last dimension. shape=(N, ..., 3)
+ E.g, (N, 3) or (N, K, 3) or (N, H, W, 3).
+ """
+ # NOTE: this seems to work for arbitrary number of dimensions of input
+ pad = torch.nn.ConstantPad1d(padding=(0, 1), value=1.)
+ return pad(xy)
+
+
+def project_points3d(Xw, K):
+ _, C = Xw.shape
+ assert C == 3
+ uv, _ = cv2.projectPoints(
+ Xw, np.zeros((3, 1), dtype=np.float32), np.zeros(3, dtype=np.float32), K, np.zeros(5, dtype=np.float32)
+ )
+ return uv.reshape(-1, 2)
+
+
+def unproject_points2d(points2d, inv_K, scale=1.0):
+ """
+ Parameters
+ ----------
+ points2d: Tensor
+ xy coordinates. shape=(N, ..., 2)
+ E.g., (N, 2) or (N, K, 2) or (N, H, W, 2)
+
+ inv_K: Tensor
+ Inverted intrinsics; shape=(N, 3, 3)
+
+ scale: float, default: 1.0
+ Scaling factor.
+
+ Returns
+ -------
+ Tensor:
+ Unprojected 3D point. shape=(N, ..., 3)
+ E.g., (N, 3) or (N, K, 3) or (N, H, W, 3)
+ """
+ points2d = homogenize_points(points2d)
+ siz = points2d.size()
+ points2d = points2d.view(-1, 3).unsqueeze(-1) # (N, 3, 1)
+ unprojected = torch.matmul(inv_K, points2d) # (N, 3, 3) x (N, 3, 1) -> (N, 3, 1)
+ unprojected = unprojected.view(siz)
+
+ return unprojected * scale
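+
+
+# Illustrative usage (hedged sketch; the intrinsics and points below are made up):
+# unproject pixel coordinates into camera-frame rays, then push them to a depth.
+#
+#   K = torch.tensor([[[1000., 0., 320.], [0., 1000., 240.], [0., 0., 1.]]])
+#   inv_K = torch.inverse(K)                          # (1, 3, 3)
+#   pts = torch.tensor([[320., 240.]])                # principal point, (1, 2)
+#   ray = unproject_points2d(pts, inv_K)              # ~ [[0., 0., 1.]]
+#   point_at_5m = ray * 5.0                           # 3D point 5 m along the ray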
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tasks.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tasks.py
new file mode 100644
index 0000000..997fbb3
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tasks.py
@@ -0,0 +1,97 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+from collections import OrderedDict
+
+# from detectron2.config import configurable
+
+
+class Task():
+ def __init__(self, name, is_detection_task, is_dense_prediction_task):
+ self.name = name
+ self.is_detection_task = is_detection_task
+ self.is_dense_prediction_task = is_dense_prediction_task
+
+
+# yapf: disable
+TASKS = [
+ Task(
+ name="box2d",
+ is_detection_task=True,
+ is_dense_prediction_task=False,
+ ),
+ Task(
+ name="box3d",
+ is_detection_task=True,
+ is_dense_prediction_task=False,
+ ),
+ Task(
+ name="depth",
+ is_detection_task=False,
+ is_dense_prediction_task=True,
+ )
+]
+# yapf: enable
+
+NAME_TO_TASK = OrderedDict([(task.name, task) for task in TASKS])
+
+
+class TaskManager():
+ #@configurable
+ def __init__(self, box2d_on=False, box3d_on=False, depth_on=False):
+ """
+ configurable is experimental.
+ """
+ self._box2d_on = self._mask2d_on = self._box3d_on = self._semseg2d_on = self._depth_on = False
+ tasks = []
+ if box2d_on:
+ tasks.append(NAME_TO_TASK['box2d'])
+ self._box2d_on = True
+ if box3d_on:
+ tasks.append(NAME_TO_TASK['box3d'])
+ self._box3d_on = True
+ if depth_on:
+ tasks.append(NAME_TO_TASK['depth'])
+ self._depth_on = True
+
+ if not tasks:
+ raise ValueError("No task specified.")
+
+ self._tasks = tasks
+
+ @property
+ def tasks(self):
+ return self._tasks
+
+ '''@classmethod
+ def from_config(cls, cfg):
+ # yapf: disable
+ return OrderedDict(
+ box2d_on = cfg.MODEL.BOX2D_ON,
+ box3d_on = cfg.MODEL.BOX3D_ON,
+ depth_on = cfg.MODEL.DEPTH_ON,
+ )
+ # yapf: enable'''
+
+    # Indicators that tell whether each task is enabled.
+ @property
+ def box2d_on(self):
+ return self._box2d_on
+
+ @property
+ def box3d_on(self):
+ return self._box3d_on
+
+ @property
+ def depth_on(self):
+ return self._depth_on
+
+ @property
+ def has_dense_prediction_task(self):
+ return any([task.is_dense_prediction_task for task in self.tasks])
+
+ @property
+ def has_detection_task(self):
+ return any([task.is_detection_task for task in self.tasks])
+
+ @property
+ def task_names(self):
+ return [task.name for task in self.tasks]
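+
+
+# Illustrative sketch (not used elsewhere in this file): a DD3D-style setup
+# enables 2D boxes, 3D boxes and dense depth together.
+#
+#   tm = TaskManager(box2d_on=True, box3d_on=True, depth_on=True)
+#   assert tm.has_detection_task and tm.has_dense_prediction_task
+#   assert tm.task_names == ['box2d', 'box3d', 'depth']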
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tensor2d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tensor2d.py
new file mode 100644
index 0000000..2922567
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tensor2d.py
@@ -0,0 +1,47 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import torch
+import torch.nn.functional as F
+
+
+def compute_features_locations(h, w, stride, dtype=torch.float32, device='cpu', offset="none"):
+ """Adapted from AdelaiDet:
+ https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py
+
+    Key difference: offset is configurable.
+ """
+ shifts_x = torch.arange(0, w * stride, step=stride, dtype=dtype, device=device)
+ shifts_y = torch.arange(0, h * stride, step=stride, dtype=dtype, device=device)
+    # explicit indexing keeps the legacy 'ij' behavior and silences the warning on newer torch
+    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing='ij')
+ shift_x = shift_x.reshape(-1)
+ shift_y = shift_y.reshape(-1)
+ # (dennis.park)
+ # locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
+ locations = torch.stack((shift_x, shift_y), dim=1)
+ if offset == "half":
+ locations += stride // 2
+ else:
+ assert offset == "none"
+
+ return locations
+
+
+def aligned_bilinear(tensor, factor, offset="none"):
+ """Adapted from AdelaiDet:
+ https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py
+ """
+ assert tensor.dim() == 4
+ assert factor >= 1
+ assert int(factor) == factor
+
+ if factor == 1:
+ return tensor
+
+ h, w = tensor.size()[2:]
+ tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
+ oh = factor * h + 1
+ ow = factor * w + 1
+ tensor = F.interpolate(tensor, size=(oh, ow), mode='bilinear', align_corners=True)
+ if offset == "half":
+ tensor = F.pad(tensor, pad=(factor // 2, 0, factor // 2, 0), mode="replicate")
+
+ return tensor[:, :, :oh - 1, :ow - 1]
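+
+
+# Illustrative sketch (not part of the original file): for a 2x2 feature map
+# with stride 8 and offset="none", the locations are the top-left corners of
+# each stride-sized cell; offset="half" shifts every location by stride // 2.
+#
+#   locs = compute_features_locations(2, 2, stride=8)
+#   # tensor([[0., 0.], [8., 0.], [0., 8.], [8., 8.]])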
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/visualization.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/visualization.py
new file mode 100644
index 0000000..71e78b1
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/visualization.py
@@ -0,0 +1,147 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+import colorsys
+import os
+
+import cv2
+import matplotlib.colors as mplc
+import numpy as np
+from PIL import Image, ImageDraw
+
+
+def fill_color_polygon(image, polygon, color, alpha=0.5):
+ """Color interior of polygon with alpha-blending. This function modified input in place.
+ """
+ _mask = Image.new('L', (image.shape[1], image.shape[0]), 0)
+ ImageDraw.Draw(_mask).polygon(polygon, outline=1, fill=1)
+    mask = np.array(_mask, bool)
+ for c in range(3):
+ channel = image[:, :, c]
+ channel[mask] = channel[mask] * (1. - alpha) + color[c] * alpha
+
+
+def change_color_brightness(color, brightness_factor):
+ """
+ Copied from detectron2.utils.visualizer.py
+ -------------------------------------------
+
+ Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+ less or more saturation than the original color.
+
+ Args:
+ color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+ formats that are accepted.
+ brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+ 0 will correspond to no change, a factor in [-1.0, 0) range will result in
+ a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+ Returns:
+ modified_color (tuple[double]): a tuple containing the RGB values of the
+ modified color. Each value in the tuple is in the [0.0, 1.0] range.
+ """
+ assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+ color = mplc.to_rgb(color)
+ polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+ modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
+ modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
+ modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
+ modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
+ return modified_color
+
+
+def draw_text(ax, text, position, *, font_size, color="g", horizontal_alignment="center", rotation=0):
+ """
+ Copied from Visualizer.draw_text()
+ -----------------------------------
+
+ Args:
+ text (str): class label
+ position (tuple): a tuple of the x and y coordinates to place text on image.
+        font_size (int, optional): font size of the text. If not provided, a font size
+ proportional to the image width is calculated and used.
+ color: color of the text. Refer to `matplotlib.colors` for full list
+ of formats that are accepted.
+ horizontal_alignment (str): see `matplotlib.text.Text`
+ rotation: rotation angle in degrees CCW
+
+ Returns:
+ output (VisImage): image object with text drawn.
+ """
+ # since the text background is dark, we don't want the text to be dark
+ color = np.maximum(list(mplc.to_rgb(color)), 0.2)
+ color[np.argmax(color)] = max(0.8, np.max(color))
+
+ x, y = position
+ ax.text(
+ x,
+ y,
+ text,
+ size=font_size,
+ family="sans-serif",
+ bbox={
+ "facecolor": "black",
+ "alpha": 0.8,
+ "pad": 0.7,
+ "edgecolor": "none"
+ },
+ verticalalignment="top",
+ horizontalalignment=horizontal_alignment,
+ color=color,
+ zorder=10,
+ rotation=rotation,
+ )
+ return ax
+
+
+def float_to_uint8_color(float_clr):
+ assert all([c >= 0. for c in float_clr])
+ assert all([c <= 1. for c in float_clr])
+ return [int(c * 255.) for c in float_clr]
+
+
+def mosaic(items, scale=1.0, pad=3, grid_width=None):
+ """Creates a mosaic from list of images.
+
+ Parameters
+ ----------
+ items: list of np.ndarray
+ List of images to mosaic.
+
+ scale: float, default=1.0
+ Scale factor applied to images. scale > 1.0 enlarges images.
+
+ pad: int, default=3
+ Padding size of the images before mosaic
+
+ grid_width: int, default=None
+ Mosaic width or grid width of the mosaic
+
+ Returns
+ -------
+ image: np.array of shape (H, W, 3)
+ Image mosaic
+ """
+ # Determine tile width and height
+ N = len(items)
+ assert N > 0, 'No items to mosaic!'
+ grid_width = grid_width if grid_width else np.ceil(np.sqrt(N)).astype(int)
+    grid_height = np.ceil(N * 1. / grid_width).astype(int)
+ input_size = items[0].shape[:2]
+ target_shape = (int(input_size[1] * scale), int(input_size[0] * scale))
+ mosaic_items = []
+ for j in range(grid_width * grid_height):
+ if j < N:
+            # Resize every tile to the common target shape derived from the
+            # first image's size and `scale`.
+ im = cv2.resize(items[j], dsize=target_shape)
+ mosaic_items.append(im)
+ else:
+ mosaic_items.append(np.zeros_like(mosaic_items[-1]))
+
+ # Stack W tiles horizontally first, then vertically
+ im_pad = lambda im: cv2.copyMakeBorder(im, pad, pad, pad, pad, cv2.BORDER_CONSTANT, 0)
+ mosaic_items = [im_pad(im) for im in mosaic_items]
+ hstack = [np.hstack(mosaic_items[j:j + grid_width]) for j in range(0, len(mosaic_items), grid_width)]
+ mosaic_viz = np.vstack(hstack) if len(hstack) > 1 \
+ else hstack[0]
+ return mosaic_viz
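+
+
+# Illustrative usage of `mosaic` (hedged sketch; the tiles are synthetic):
+#
+#   tiles = [np.full((100, 100, 3), v, dtype=np.uint8) for v in (0, 128, 255)]
+#   grid = mosaic(tiles, scale=0.5, pad=2, grid_width=2)
+#   # -> a 2x2 grid; the missing fourth tile is filled with zeros.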
diff --git a/adzoo/bevformer/mmdet3d_plugin/models/hooks/__init__.py b/adzoo/bevformer/mmdet3d_plugin/models/hooks/__init__.py
new file mode 100644
index 0000000..93b13c9
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/models/hooks/__init__.py
@@ -0,0 +1 @@
+from .hooks import GradChecker
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/models/hooks/hooks.py b/adzoo/bevformer/mmdet3d_plugin/models/hooks/hooks.py
new file mode 100644
index 0000000..56ff7fd
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/models/hooks/hooks.py
@@ -0,0 +1,13 @@
+from mmcv.runner.hooks.hook import HOOKS, Hook
+from projects.mmdet3d_plugin.models.utils import run_time
+
+
+@HOOKS.register_module()
+class GradChecker(Hook):
+
+ def after_train_iter(self, runner):
+ for key, val in runner.model.named_parameters():
+            if val.grad is None and val.requires_grad:
+                print('WARNING: the parameters of {key} are not used!'.format(key=key))
+
+
diff --git a/adzoo/bevformer/model_converters/convert_votenet_checkpoints.py b/adzoo/bevformer/model_converters/convert_votenet_checkpoints.py
new file mode 100755
index 0000000..33792b0
--- /dev/null
+++ b/adzoo/bevformer/model_converters/convert_votenet_checkpoints.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+import torch
+from mmcv import Config
+from mmcv.runner import load_state_dict
+
+from mmdet3d.models import build_detector
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet3D upgrade model version(before v0.6.0) of VoteNet')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument('--out', help='path of the output checkpoint file')
+ args = parser.parse_args()
+ return args
+
+
+def parse_config(config_strings):
+ """Parse config from strings.
+
+ Args:
+ config_strings (string): strings of model config.
+
+ Returns:
+ Config: model config
+ """
+ temp_file = tempfile.NamedTemporaryFile()
+ config_path = f'{temp_file.name}.py'
+ with open(config_path, 'w') as f:
+ f.write(config_strings)
+
+ config = Config.fromfile(config_path)
+
+ # Update backbone config
+ if 'pool_mod' in config.model.backbone:
+ config.model.backbone.pop('pool_mod')
+
+ if 'sa_cfg' not in config.model.backbone:
+ config.model.backbone['sa_cfg'] = dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True)
+
+ if 'type' not in config.model.bbox_head.vote_aggregation_cfg:
+ config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule'
+
+ # Update bbox_head config
+ if 'pred_layer_cfg' not in config.model.bbox_head:
+ config.model.bbox_head['pred_layer_cfg'] = dict(
+ in_channels=128, shared_conv_channels=(128, 128), bias=True)
+
+ if 'feat_channels' in config.model.bbox_head:
+ config.model.bbox_head.pop('feat_channels')
+
+ if 'vote_moudule_cfg' in config.model.bbox_head:
+ config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop(
+ 'vote_moudule_cfg')
+
+ if config.model.bbox_head.vote_aggregation_cfg.use_xyz:
+ config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3
+
+ temp_file.close()
+
+ return config
+
+
+def main():
+ """Convert keys in checkpoints for VoteNet.
+
+ There can be some breaking changes during the development of mmdetection3d,
+ and this tool is used for upgrading checkpoints trained with old versions
+ (before v0.6.0) to the latest one.
+ """
+ args = parse_args()
+ checkpoint = torch.load(args.checkpoint)
+ cfg = parse_config(checkpoint['meta']['config'])
+ # Build the model and load checkpoint
+ model = build_detector(
+ cfg.model,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+ orig_ckpt = checkpoint['state_dict']
+ converted_ckpt = orig_ckpt.copy()
+
+ if cfg['dataset_type'] == 'ScanNetDataset':
+ NUM_CLASSES = 18
+ elif cfg['dataset_type'] == 'SUNRGBDDataset':
+ NUM_CLASSES = 10
+ else:
+ raise NotImplementedError
+
+ RENAME_PREFIX = {
+ 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0',
+ 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1'
+ }
+
+ DEL_KEYS = [
+ 'bbox_head.conv_pred.0.bn.num_batches_tracked',
+ 'bbox_head.conv_pred.1.bn.num_batches_tracked'
+ ]
+
+ EXTRACT_KEYS = {
+ 'bbox_head.conv_pred.conv_cls.weight':
+ ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]),
+ 'bbox_head.conv_pred.conv_cls.bias':
+ ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]),
+ 'bbox_head.conv_pred.conv_reg.weight':
+ ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]),
+ 'bbox_head.conv_pred.conv_reg.bias':
+ ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)])
+ }
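+    # Each value above is (old_key, index ranges): a pair (start, end) selects
+    # channels [start:end] of the fused conv_out tensor, end == -1 means "to the
+    # end" (i.e. [start:]), and the selected slices are concatenated along dim 0.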
+
+ # Delete some useless keys
+ for key in DEL_KEYS:
+ converted_ckpt.pop(key)
+
+ # Rename keys with specific prefix
+ RENAME_KEYS = dict()
+ for old_key in converted_ckpt.keys():
+ for rename_prefix in RENAME_PREFIX.keys():
+ if rename_prefix in old_key:
+ new_key = old_key.replace(rename_prefix,
+ RENAME_PREFIX[rename_prefix])
+ RENAME_KEYS[new_key] = old_key
+ for new_key, old_key in RENAME_KEYS.items():
+ converted_ckpt[new_key] = converted_ckpt.pop(old_key)
+
+ # Extract weights and rename the keys
+ for new_key, (old_key, indices) in EXTRACT_KEYS.items():
+ cur_layers = orig_ckpt[old_key]
+ converted_layers = []
+ for (start, end) in indices:
+ if end != -1:
+ converted_layers.append(cur_layers[start:end])
+ else:
+ converted_layers.append(cur_layers[start:])
+ converted_layers = torch.cat(converted_layers, 0)
+ converted_ckpt[new_key] = converted_layers
+ if old_key in converted_ckpt.keys():
+ converted_ckpt.pop(old_key)
+
+ # Check the converted checkpoint by loading to the model
+ load_state_dict(model, converted_ckpt, strict=True)
+ checkpoint['state_dict'] = converted_ckpt
+ torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/model_converters/publish_model.py b/adzoo/bevformer/model_converters/publish_model.py
new file mode 100755
index 0000000..318fd46
--- /dev/null
+++ b/adzoo/bevformer/model_converters/publish_model.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+import torch
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Process a checkpoint to be published')
+ parser.add_argument('in_file', help='input checkpoint filename')
+ parser.add_argument('out_file', help='output checkpoint filename')
+ args = parser.parse_args()
+ return args
+
+
+def process_checkpoint(in_file, out_file):
+ checkpoint = torch.load(in_file, map_location='cpu')
+ # remove optimizer for smaller file size
+ if 'optimizer' in checkpoint:
+ del checkpoint['optimizer']
+ # if it is necessary to remove some sensitive data in checkpoint['meta'],
+ # add the code here.
+ torch.save(checkpoint, out_file)
+ sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    # str.rstrip strips a character set, not a suffix, so remove '.pth' explicitly.
+    out_name = out_file[:-len('.pth')] if out_file.endswith('.pth') else out_file
+    final_file = out_name + '-{}.pth'.format(sha[:8])
+ subprocess.Popen(['mv', out_file, final_file])
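+
+# Illustrative usage (paths are hypothetical):
+#   python publish_model.py work_dirs/bevformer/latest.pth bevformer_base.pth
+# drops the optimizer state and renames the output to something like
+# bevformer_base-0a1b2c3d.pth, where the suffix is the first 8 hex characters
+# of the file's sha256.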
+
+
+def main():
+ args = parse_args()
+ process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/model_converters/regnet2mmdet.py b/adzoo/bevformer/model_converters/regnet2mmdet.py
new file mode 100755
index 0000000..9dee3c8
--- /dev/null
+++ b/adzoo/bevformer/model_converters/regnet2mmdet.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import torch
+from collections import OrderedDict
+
+
+def convert_stem(model_key, model_weight, state_dict, converted_names):
+ new_key = model_key.replace('stem.conv', 'conv1')
+ new_key = new_key.replace('stem.bn', 'bn1')
+ state_dict[new_key] = model_weight
+ converted_names.add(model_key)
+ print(f'Convert {model_key} to {new_key}')
+
+
+def convert_head(model_key, model_weight, state_dict, converted_names):
+ new_key = model_key.replace('head.fc', 'fc')
+ state_dict[new_key] = model_weight
+ converted_names.add(model_key)
+ print(f'Convert {model_key} to {new_key}')
+
+
+def convert_reslayer(model_key, model_weight, state_dict, converted_names):
+ split_keys = model_key.split('.')
+ layer, block, module = split_keys[:3]
+ block_id = int(block[1:])
+ layer_name = f'layer{int(layer[1:])}'
+ block_name = f'{block_id - 1}'
+
+ if block_id == 1 and module == 'bn':
+ new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}'
+ elif block_id == 1 and module == 'proj':
+ new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}'
+ elif module == 'f':
+ if split_keys[3] == 'a_bn':
+ module_name = 'bn1'
+ elif split_keys[3] == 'b_bn':
+ module_name = 'bn2'
+ elif split_keys[3] == 'c_bn':
+ module_name = 'bn3'
+ elif split_keys[3] == 'a':
+ module_name = 'conv1'
+ elif split_keys[3] == 'b':
+ module_name = 'conv2'
+        elif split_keys[3] == 'c':
+            module_name = 'conv3'
+        else:
+            raise ValueError(f'Unsupported conversion of key {model_key}')
+        new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}'
+ else:
+ raise ValueError(f'Unsupported conversion of key {model_key}')
+ print(f'Convert {model_key} to {new_key}')
+ state_dict[new_key] = model_weight
+ converted_names.add(model_key)
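+
+# Illustrative mappings produced by convert_reslayer (keys shown are hypothetical):
+#   's1.b1.proj.weight' -> 'layer1.0.downsample.0.weight'
+#   's2.b3.f.b.weight'  -> 'layer2.2.conv2.weight'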
+
+
+def convert(src, dst):
+ """Convert keys in pycls pretrained RegNet models to mmdet style."""
+    # load the pycls model weights
+ regnet_model = torch.load(src)
+ blobs = regnet_model['model_state']
+ # convert to pytorch style
+ state_dict = OrderedDict()
+ converted_names = set()
+ for key, weight in blobs.items():
+ if 'stem' in key:
+ convert_stem(key, weight, state_dict, converted_names)
+ elif 'head' in key:
+ convert_head(key, weight, state_dict, converted_names)
+ elif key.startswith('s'):
+ convert_reslayer(key, weight, state_dict, converted_names)
+
+ # check if all layers are converted
+ for key in blobs:
+ if key not in converted_names:
+ print(f'not converted: {key}')
+ # save checkpoint
+ checkpoint = dict()
+ checkpoint['state_dict'] = state_dict
+ torch.save(checkpoint, dst)
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Convert model keys')
+ parser.add_argument('src', help='src detectron model path')
+ parser.add_argument('dst', help='save path')
+ args = parser.parse_args()
+ convert(args.src, args.dst)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/test.py b/adzoo/bevformer/test.py
new file mode 100755
index 0000000..ca3a035
--- /dev/null
+++ b/adzoo/bevformer/test.py
@@ -0,0 +1,259 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import argparse
+import os
+import torch
+import warnings
+from mmcv.utils import get_dist_info, init_dist, wrap_fp16_model, set_random_seed, Config, DictAction, load_checkpoint
+from mmcv.models import build_model, fuse_conv_bn
+from torch.nn import DataParallel
+from torch.nn.parallel.distributed import DistributedDataParallel
+
+from mmcv.datasets import build_dataset, build_dataloader, replace_ImageToTensor
+import time
+import os.path as osp
+from adzoo.bevformer.apis.test import custom_multi_gpu_test
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet test (and eval) a model')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument('--out', help='output result file in pickle format')
+ parser.add_argument(
+ '--fuse-conv-bn',
+ action='store_true',
+        help='Whether to fuse conv and bn; this will slightly increase '
+        'the inference speed')
+ parser.add_argument(
+ '--format-only',
+ action='store_true',
+        help='Format the output results without performing evaluation. It is '
+        'useful when you want to format the result to a specific format and '
+ 'submit it to the test server')
+ parser.add_argument(
+ '--eval',
+ type=str,
+ nargs='+',
+ help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+ ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+ parser.add_argument('--show', action='store_true', help='show results')
+ parser.add_argument(
+ '--show-dir', help='directory where results will be saved')
+ parser.add_argument(
+ '--gpu-collect',
+ action='store_true',
+ help='whether to use gpu to collect results.')
+ parser.add_argument(
+ '--tmpdir',
+ help='tmp directory used for collecting results from multiple '
+ 'workers, available when gpu-collect is not specified')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ parser.add_argument(
+ '--options',
+ nargs='+',
+ action=DictAction,
+ help='custom options for evaluation, the key-value pair in xxx=yyy '
+ 'format will be kwargs for dataset.evaluate() function (deprecate), '
+ 'change to --eval-options instead.')
+ parser.add_argument(
+ '--eval-options',
+ nargs='+',
+ action=DictAction,
+ help='custom options for evaluation, the key-value pair in xxx=yyy '
+ 'format will be kwargs for dataset.evaluate() function')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local-rank', type=int, default=0)
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ if args.options and args.eval_options:
+ raise ValueError(
+ '--options and --eval-options cannot be both specified, '
+ '--options is deprecated in favor of --eval-options')
+ if args.options:
+ warnings.warn('--options is deprecated in favor of --eval-options')
+ args.eval_options = args.options
+ return args
+
+
+def main():
+ args = parse_args()
+
+ assert args.out or args.eval or args.format_only or args.show \
+ or args.show_dir, \
+ ('Please specify at least one operation (save/eval/format/show the '
+ 'results / save the results) with the argument "--out", "--eval"'
+ ', "--format-only", "--show" or "--show-dir"')
+
+ if args.eval and args.format_only:
+ raise ValueError('--eval and --format_only cannot be both specified')
+
+ if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+ raise ValueError('The output file must be a pkl file.')
+
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+
+ # # import modules from plguin/xx, registry will be updated
+ # if hasattr(cfg, 'plugin'):
+ # if cfg.plugin:
+ # import importlib
+ # if hasattr(cfg, 'plugin_dir'):
+ # plugin_dir = cfg.plugin_dir
+ # _module_dir = os.path.dirname(plugin_dir)
+ # _module_dir = _module_dir.split('/')
+ # _module_path = _module_dir[0]
+
+ # for m in _module_dir[1:]:
+ # _module_path = _module_path + '.' + m
+ # print(_module_path)
+ # plg_lib = importlib.import_module(_module_path)
+ # else:
+ # # import dir is the dirpath for the config file
+ # _module_dir = os.path.dirname(args.config)
+ # _module_dir = _module_dir.split('/')
+ # _module_path = _module_dir[0]
+ # for m in _module_dir[1:]:
+ # _module_path = _module_path + '.' + m
+ # print(_module_path)
+ # plg_lib = importlib.import_module(_module_path)
+
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+ # set tf32
+ if cfg.get('close_tf32', False):
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+
+ cfg.model.pretrained = None
+ # in case the test dataset is concatenated
+ samples_per_gpu = 1
+ if isinstance(cfg.data.test, dict):
+ cfg.data.test.test_mode = True
+ samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+ if samples_per_gpu > 1:
+ # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+ cfg.data.test.pipeline = replace_ImageToTensor(
+ cfg.data.test.pipeline)
+ elif isinstance(cfg.data.test, list):
+ for ds_cfg in cfg.data.test:
+ ds_cfg.test_mode = True
+ samples_per_gpu = max(
+ [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+ if samples_per_gpu > 1:
+ for ds_cfg in cfg.data.test:
+ ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ else:
+ distributed = True
+ init_dist(args.launcher, **cfg.dist_params)
+
+ # set random seeds
+ if args.seed is not None:
+ set_random_seed(args.seed, deterministic=args.deterministic)
+
+ # build the dataloader
+ dataset = build_dataset(cfg.data.test)
+ data_loader = build_dataloader(
+ dataset,
+ samples_per_gpu=samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+ )
+
+ # build the model and load checkpoint
+ cfg.model.train_cfg = None
+ model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(model)
+ checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+ if args.fuse_conv_bn:
+ model = fuse_conv_bn(model)
+    # old versions did not save class info in checkpoints; this workaround is
+ # for backward compatibility
+ if 'CLASSES' in checkpoint.get('meta', {}):
+ model.CLASSES = checkpoint['meta']['CLASSES']
+ else:
+ model.CLASSES = dataset.CLASSES
+ # palette for visualization in segmentation tasks
+ if 'PALETTE' in checkpoint.get('meta', {}):
+ model.PALETTE = checkpoint['meta']['PALETTE']
+ elif hasattr(dataset, 'PALETTE'):
+ # segmentation dataset has `PALETTE` attribute
+ model.PALETTE = dataset.PALETTE
+
+ if not distributed:
+ assert False
+ # model = MMDataParallel(model, device_ids=[0])
+ # outputs = single_gpu_test(model, data_loader, args.show, args.show_dir)
+ else:
+ model = DistributedDataParallel(
+ model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False)
+ outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir,
+ args.gpu_collect)
+
+ rank, _ = get_dist_info()
+ if rank == 0:
+ if args.out:
+ print(f'\nwriting results to {args.out}')
+ assert False
+ #mmcv.dump(outputs['bbox_results'], args.out)
+ kwargs = {} if args.eval_options is None else args.eval_options
+ kwargs['jsonfile_prefix'] = osp.join('test', args.config.split(
+ '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_'))
+ if args.format_only:
+ dataset.format_results(outputs, **kwargs)
+
+ if args.eval:
+ eval_kwargs = cfg.get('evaluation', {}).copy()
+ # hard-code way to remove EvalHook args
+ for key in [
+ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
+ 'rule'
+ ]:
+ eval_kwargs.pop(key, None)
+ eval_kwargs.update(dict(metric=args.eval, **kwargs))
+
+ print(dataset.evaluate(outputs, **eval_kwargs))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/bevformer/train.py b/adzoo/bevformer/train.py
new file mode 100755
index 0000000..ce20ce4
--- /dev/null
+++ b/adzoo/bevformer/train.py
@@ -0,0 +1,237 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+
+from __future__ import division
+
+import argparse
+import copy
+import mmcv
+import os
+import time
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.utils import get_dist_info, init_dist
+from os import path as osp
+
+
+from mmcv.datasets import build_dataset
+from mmcv.models import build_model
+from mmcv.utils import collect_env, get_root_logger
+from mmcv.utils import set_random_seed
+
+from mmcv.utils import TORCH_VERSION, digit_version
+from adzoo.bevformer.mmdet3d_plugin.bevformer.apis.train import custom_train_model
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Train a detector')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--work-dir', help='the dir to save logs and models')
+ parser.add_argument(
+ '--resume-from', help='the checkpoint file to resume from')
+ parser.add_argument(
+        '--load-from', help='the checkpoint file to load weights from')
+ parser.add_argument(
+ '--no-validate',
+ action='store_true',
+ help='whether not to evaluate the checkpoint during training')
+ group_gpus = parser.add_mutually_exclusive_group()
+ group_gpus.add_argument(
+ '--gpus',
+ type=int,
+ help='number of gpus to use '
+ '(only applicable to non-distributed training)')
+ group_gpus.add_argument(
+ '--gpu-ids',
+ type=int,
+ nargs='+',
+ help='ids of gpus to use '
+ '(only applicable to non-distributed training)')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file (deprecate), '
+ 'change to --cfg-options instead.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local-rank', type=int, default=0)
+ parser.add_argument(
+ '--autoscale-lr',
+ action='store_true',
+ help='automatically scale lr with the number of gpus')
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ if args.options and args.cfg_options:
+ raise ValueError(
+ '--options and --cfg-options cannot be both specified, '
+ '--options is deprecated in favor of --cfg-options')
+ if args.options:
+ warnings.warn('--options is deprecated in favor of --cfg-options')
+ args.cfg_options = args.options
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+
+ # set cudnn_benchmark
+ #if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+ # set tf32
+ # if cfg.get('close_tf32', False):
+ # torch.backends.cuda.matmul.allow_tf32 = False
+ # torch.backends.cudnn.allow_tf32 = False
+
+ # work_dir is determined in this priority: CLI > segment in file > filename
+ if args.work_dir is not None:
+ # update configs according to CLI args if args.work_dir is not None
+ cfg.work_dir = args.work_dir
+ elif cfg.get('work_dir', None) is None:
+ # use config filename as default work_dir if cfg.work_dir is None
+ cfg.work_dir = osp.join('./work_dirs',
+ osp.splitext(osp.basename(args.config))[0])
+ # if args.resume_from is not None:
+ if args.resume_from is not None and osp.isfile(args.resume_from):
+ cfg.resume_from = args.resume_from
+ if args.load_from is not None and osp.isfile(args.load_from):
+ cfg.load_from = args.load_from
+ if args.gpu_ids is not None:
+ cfg.gpu_ids = args.gpu_ids
+ else:
+ cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+ if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW':
+ cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw
+ if args.autoscale_lr:
+ # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+ cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ else:
+ distributed = True
+ torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+ init_dist(args.launcher, **cfg.dist_params)
+ # re-set gpu_ids with distributed training mode
+ _, world_size = get_dist_info()
+ cfg.gpu_ids = range(world_size)
+
+ # create work_dir
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+ # dump config
+ cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+ # init the logger before other steps
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+ log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+ # specify logger name, if we still use 'mmdet', the output info will be
+ # filtered and won't be saved in the log_file
+ # TODO: ugly workaround to judge whether we are training det or seg model
+ if cfg.model.type in ['EncoderDecoder3D']:
+ logger_name = 'mmseg'
+ else:
+ logger_name = 'mmdet'
+ logger = get_root_logger(
+ log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+ # init the meta dict to record some important information such as
+ # environment info and seed, which will be logged
+ meta = dict()
+ # log env info
+ env_info_dict = collect_env()
+ env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+ dash_line = '-' * 60 + '\n'
+ logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+ dash_line)
+ meta['env_info'] = env_info
+ meta['config'] = cfg.pretty_text
+
+ # log some basic info
+ logger.info(f'Distributed training: {distributed}')
+ logger.info(f'Config:\n{cfg.pretty_text}')
+
+ # set random seeds
+ if args.seed is not None:
+ logger.info(f'Set random seed to {args.seed}, '
+ f'deterministic: {args.deterministic}')
+ set_random_seed(args.seed, deterministic=args.deterministic)
+ cfg.seed = args.seed
+ meta['seed'] = args.seed
+ meta['exp_name'] = osp.basename(args.config)
+
+ model = build_model(
+ cfg.model,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+ model.init_weights()
+
+ logger.info(f'Model:\n{model}')
+ datasets = [build_dataset(cfg.data.train)]
+ if len(cfg.workflow) == 2:
+ val_dataset = copy.deepcopy(cfg.data.val)
+ # in case we use a dataset wrapper
+ if 'dataset' in cfg.data.train:
+ val_dataset.pipeline = cfg.data.train.dataset.pipeline
+ else:
+ val_dataset.pipeline = cfg.data.train.pipeline
+ # set test_mode=False here in deep copied config
+ # which do not affect AP/AR calculation later
+ # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
+ val_dataset.test_mode = False
+ datasets.append(build_dataset(val_dataset))
+ if cfg.checkpoint_config is not None:
+ # save mmdet version, config file content and class names in
+ # checkpoints as meta data
+ cfg.checkpoint_config.meta = dict(
+ config=cfg.pretty_text,
+ CLASSES=datasets[0].CLASSES,
+ PALETTE=datasets[0].PALETTE # for segmentors
+ if hasattr(datasets[0], 'PALETTE') else None)
+ # add an attribute for visualization convenience
+ model.CLASSES = datasets[0].CLASSES
+ custom_train_model(
+ model,
+ datasets,
+ cfg,
+ distributed=distributed,
+ validate=(not args.no_validate),
+ timestamp=timestamp,
+ meta=meta)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/uniad/analysis_tools/__init__.py b/adzoo/uniad/analysis_tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/adzoo/uniad/analysis_tools/analyze_logs.py b/adzoo/uniad/analysis_tools/analyze_logs.py
new file mode 100755
index 0000000..806175f
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/analyze_logs.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import numpy as np
+import seaborn as sns
+from collections import defaultdict
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+ for i, log_dict in enumerate(log_dicts):
+ print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+ all_times = []
+ for epoch in log_dict.keys():
+ if args.include_outliers:
+ all_times.append(log_dict[epoch]['time'])
+ else:
+ all_times.append(log_dict[epoch]['time'][1:])
+ all_times = np.array(all_times)
+ epoch_ave_time = all_times.mean(-1)
+ slowest_epoch = epoch_ave_time.argmax()
+ fastest_epoch = epoch_ave_time.argmin()
+ std_over_epoch = epoch_ave_time.std()
+ print(f'slowest epoch {slowest_epoch + 1}, '
+ f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+ print(f'fastest epoch {fastest_epoch + 1}, '
+ f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+ print(f'time std over epochs is {std_over_epoch:.4f}')
+ print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+ print()
+
+
+def plot_curve(log_dicts, args):
+ if args.backend is not None:
+ plt.switch_backend(args.backend)
+ sns.set_style(args.style)
+ # if legend is None, use {filename}_{key} as legend
+ legend = args.legend
+ if legend is None:
+ legend = []
+ for json_log in args.json_logs:
+ for metric in args.keys:
+ legend.append(f'{json_log}_{metric}')
+ assert len(legend) == (len(args.json_logs) * len(args.keys))
+ metrics = args.keys
+
+ num_metrics = len(metrics)
+ for i, log_dict in enumerate(log_dicts):
+ epochs = list(log_dict.keys())
+ for j, metric in enumerate(metrics):
+ print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+ if metric not in log_dict[epochs[args.interval - 1]]:
+ raise KeyError(
+ f'{args.json_logs[i]} does not contain metric {metric}')
+
+ if args.mode == 'eval':
+ if min(epochs) == args.interval:
+ x0 = args.interval
+ else:
+ # if current training is resumed from previous checkpoint
+ # we lost information in early epochs
+ # `xs` should start according to `min(epochs)`
+ if min(epochs) % args.interval == 0:
+ x0 = min(epochs)
+ else:
+                    # find the first epoch that does eval
+ x0 = min(epochs) + args.interval - \
+ min(epochs) % args.interval
+ xs = np.arange(x0, max(epochs) + 1, args.interval)
+ ys = []
+ for epoch in epochs[args.interval - 1::args.interval]:
+ ys += log_dict[epoch][metric]
+
+ # if training is aborted before eval of the last epoch
+ # `xs` and `ys` will have different length and cause an error
+ # check if `ys[-1]` is empty here
+ if not log_dict[epoch][metric]:
+ xs = xs[:-1]
+
+ ax = plt.gca()
+ ax.set_xticks(xs)
+ plt.xlabel('epoch')
+ plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+ else:
+ xs = []
+ ys = []
+ num_iters_per_epoch = \
+ log_dict[epochs[args.interval-1]]['iter'][-1]
+ for epoch in epochs[args.interval - 1::args.interval]:
+ iters = log_dict[epoch]['iter']
+ if log_dict[epoch]['mode'][-1] == 'val':
+ iters = iters[:-1]
+ xs.append(
+ np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+ ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+ xs = np.concatenate(xs)
+ ys = np.concatenate(ys)
+ plt.xlabel('iter')
+ plt.plot(
+ xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+ plt.legend()
+ if args.title is not None:
+ plt.title(args.title)
+ if args.out is None:
+ plt.show()
+ else:
+ print(f'save curve to: {args.out}')
+ plt.savefig(args.out)
+ plt.cla()
+
+
+def add_plot_parser(subparsers):
+ parser_plt = subparsers.add_parser(
+ 'plot_curve', help='parser for plotting curves')
+ parser_plt.add_argument(
+ 'json_logs',
+ type=str,
+ nargs='+',
+ help='path of train log in json format')
+ parser_plt.add_argument(
+ '--keys',
+ type=str,
+ nargs='+',
+ default=['mAP_0.25'],
+ help='the metric that you want to plot')
+ parser_plt.add_argument('--title', type=str, help='title of figure')
+ parser_plt.add_argument(
+ '--legend',
+ type=str,
+ nargs='+',
+ default=None,
+ help='legend of each plot')
+ parser_plt.add_argument(
+ '--backend', type=str, default=None, help='backend of plt')
+ parser_plt.add_argument(
+ '--style', type=str, default='dark', help='style of plt')
+ parser_plt.add_argument('--out', type=str, default=None)
+ parser_plt.add_argument('--mode', type=str, default='train')
+ parser_plt.add_argument('--interval', type=int, default=1)
+
+
+def add_time_parser(subparsers):
+ parser_time = subparsers.add_parser(
+ 'cal_train_time',
+ help='parser for computing the average time per training iteration')
+ parser_time.add_argument(
+ 'json_logs',
+ type=str,
+ nargs='+',
+ help='path of train log in json format')
+ parser_time.add_argument(
+ '--include-outliers',
+ action='store_true',
+ help='include the first value of every epoch when computing '
+ 'the average time')
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Analyze Json Log')
+ # currently only support plot curve and calculate average train time
+ subparsers = parser.add_subparsers(dest='task', help='task parser')
+ add_plot_parser(subparsers)
+ add_time_parser(subparsers)
+ args = parser.parse_args()
+ return args
+
+
+def load_json_logs(json_logs):
+ # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+ # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+ # value of sub dict is a list of corresponding values of all iterations
+ log_dicts = [dict() for _ in json_logs]
+ for json_log, log_dict in zip(json_logs, log_dicts):
+ with open(json_log, 'r') as log_file:
+ for line in log_file:
+ log = json.loads(line.strip())
+ # skip lines without `epoch` field
+ if 'epoch' not in log:
+ continue
+ epoch = log.pop('epoch')
+ if epoch not in log_dict:
+ log_dict[epoch] = defaultdict(list)
+ for k, v in log.items():
+ log_dict[epoch][k].append(v)
+ return log_dicts
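+
+# Each line of an mmcv-style json log is expected to look roughly like
+# (illustrative values):
+#   {"mode": "train", "epoch": 1, "iter": 50, "lr": 0.0002, "time": 0.61, "loss": 1.23}
+# Lines without an "epoch" field (e.g. the environment/config header) are skipped.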
+
+
+def main():
+ args = parse_args()
+
+ json_logs = args.json_logs
+ for json_log in json_logs:
+ assert json_log.endswith('.json')
+
+ log_dicts = load_json_logs(json_logs)
+
+ eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/uniad/analysis_tools/benchmark.py b/adzoo/uniad/analysis_tools/benchmark.py
new file mode 100755
index 0000000..6ed3976
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/benchmark.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+sys.path.append('.')
+from mmcv.datasets.builder import build_dataloader
+from mmcv.datasets import build_dataset
+from mmcv.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument(
+        '--samples', type=int, default=2000, help='samples to benchmark')
+    parser.add_argument(
+        '--log-interval', type=int, default=10, help='interval of logging')
+ parser.add_argument(
+ '--fuse-conv-bn',
+ action='store_true',
+        help='Whether to fuse conv and bn; this will slightly increase '
+        'the inference speed')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+ cfg.model.pretrained = None
+ cfg.data.test.test_mode = True
+
+ # build the dataloader
+ # TODO: support multiple images per gpu (only minor changes are needed)
+ print(cfg.data.test)
+ dataset = build_dataset(cfg.data.test)
+ data_loader = build_dataloader(
+ dataset,
+ samples_per_gpu=1,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=False,
+ shuffle=False)
+
+ # build the model and load checkpoint
+ cfg.model.train_cfg = None
+ model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(model)
+ if args.checkpoint is not None:
+ load_checkpoint(model, args.checkpoint, map_location='cpu')
+ #if args.fuse_conv_bn:
+ # model = fuse_module(model)
+
+ model = MMDataParallel(model, device_ids=[0])
+
+ model.eval()
+
+ # the first several iterations may be very slow so skip them
+ num_warmup = 5
+ pure_inf_time = 0
+
+ # benchmark with several samples and take the average
+ for i, data in enumerate(data_loader):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ with torch.no_grad():
+ model(return_loss=False, rescale=True, **data)
+
+ torch.cuda.synchronize()
+ elapsed = time.perf_counter() - start_time
+
+ if i >= num_warmup:
+ pure_inf_time += elapsed
+ if (i + 1) % args.log_interval == 0:
+ fps = (i + 1 - num_warmup) / pure_inf_time
+ print(f'Done image [{i + 1:<3}/ {args.samples}], '
+ f'fps: {fps:.1f} img / s')
+
+ if (i + 1) == args.samples:
+ pure_inf_time += elapsed
+ fps = (i + 1 - num_warmup) / pure_inf_time
+ print(f'Overall fps: {fps:.1f} img / s')
+ break
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/uniad/analysis_tools/visualize/render/base_render.py b/adzoo/uniad/analysis_tools/visualize/render/base_render.py
new file mode 100644
index 0000000..65dbbeb
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/visualize/render/base_render.py
@@ -0,0 +1,32 @@
+import matplotlib.pyplot as plt
+from pyquaternion import Quaternion
+
+
+class BaseRender:
+ """
+ BaseRender class
+ """
+
+ def __init__(
+ self,
+ figsize=(10, 10)):
+ self.figsize = figsize
+ self.fig, self.axes = None, None
+
+ def reset_canvas(self, dx=1, dy=1, tight_layout=False):
+ plt.close()
+ plt.gca().set_axis_off()
+ plt.axis('off')
+ self.fig, self.axes = plt.subplots(dx, dy, figsize=self.figsize)
+ if tight_layout:
+ plt.tight_layout()
+
+ def close_canvas(self):
+ plt.close()
+
+ def save_fig(self, filename):
+ plt.subplots_adjust(top=1, bottom=0, right=1, left=0,
+ hspace=0, wspace=0)
+ plt.margins(0, 0)
+ print(f'saving to {filename}')
+ plt.savefig(filename)
diff --git a/adzoo/uniad/analysis_tools/visualize/render/bev_render.py b/adzoo/uniad/analysis_tools/visualize/render/bev_render.py
new file mode 100644
index 0000000..fcc6ffa
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/visualize/render/bev_render.py
@@ -0,0 +1,264 @@
+import cv2
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+from pyquaternion import Quaternion
+from nuscenes.prediction import PredictHelper, convert_local_coords_to_global
+from nuscenes.utils.data_classes import LidarPointCloud
+from tools.analysis_tools.visualize.render.base_render import BaseRender
+from tools.analysis_tools.visualize.utils import color_mapping, AgentPredictionData
+
+
+class BEVRender(BaseRender):
+ """
+ Render class for BEV
+ """
+
+ def __init__(self,
+ figsize=(20, 20),
+ margin: float = 50,
+ view: np.ndarray = np.eye(4),
+ show_gt_boxes=False):
+ super(BEVRender, self).__init__(figsize)
+ self.margin = margin
+ self.view = view
+ self.show_gt_boxes = show_gt_boxes
+
+ def set_plot_cfg(self):
+ self.axes.set_xlim([-self.margin, self.margin])
+ self.axes.set_ylim([-self.margin, self.margin])
+ self.axes.set_aspect('equal')
+ self.axes.grid(False)
+
+ def render_sample_data(self, canvas, sample_token):
+ pass
+
+ def render_anno_data(
+ self,
+ sample_token,
+ nusc,
+ predict_helper):
+ sample_record = nusc.get('sample', sample_token)
+ assert 'LIDAR_TOP' in sample_record['data'].keys(
+ ), 'Error: No LIDAR_TOP in data, unable to render.'
+ lidar_record = sample_record['data']['LIDAR_TOP']
+ data_path, boxes, _ = nusc.get_sample_data(
+ lidar_record, selected_anntokens=sample_record['anns'])
+ for box in boxes:
+ instance_token = nusc.get('sample_annotation', box.token)[
+ 'instance_token']
+ future_xy_local = predict_helper.get_future_for_agent(
+ instance_token, sample_token, seconds=6, in_agent_frame=True)
+ if future_xy_local.shape[0] > 0:
+ trans = box.center
+ rot = Quaternion(matrix=box.rotation_matrix)
+ future_xy = convert_local_coords_to_global(
+ future_xy_local, trans, rot)
+ future_xy = np.concatenate(
+ [trans[None, :2], future_xy], axis=0)
+ c = np.array([0, 0.8, 0])
+ box.render(self.axes, view=self.view, colors=(c, c, c))
+ self._render_traj(future_xy, line_color=c, dot_color=(0, 0, 0))
+ self.axes.set_xlim([-self.margin, self.margin])
+ self.axes.set_ylim([-self.margin, self.margin])
+
+ def show_lidar_data(
+ self,
+ sample_token,
+ nusc):
+ sample_record = nusc.get('sample', sample_token)
+ assert 'LIDAR_TOP' in sample_record['data'].keys(
+ ), 'Error: No LIDAR_TOP in data, unable to render.'
+ lidar_record = sample_record['data']['LIDAR_TOP']
+ data_path, boxes, _ = nusc.get_sample_data(
+ lidar_record, selected_anntokens=sample_record['anns'])
+ LidarPointCloud.from_file(data_path).render_height(
+ self.axes, view=self.view)
+ self.axes.set_xlim([-self.margin, self.margin])
+ self.axes.set_ylim([-self.margin, self.margin])
+ self.axes.axis('off')
+ self.axes.set_aspect('equal')
+
+ def render_pred_box_data(self, agent_prediction_list):
+ for pred_agent in agent_prediction_list:
+ c = np.array([0, 1, 0])
+ if hasattr(pred_agent, 'pred_track_id') and pred_agent.pred_track_id is not None: # this is true
+ tr_id = pred_agent.pred_track_id
+ c = color_mapping[tr_id % len(color_mapping)]
+ pred_agent.nusc_box.render(
+ axis=self.axes, view=self.view, colors=(c, c, c))
+ if pred_agent.is_sdc:
+ c = np.array([1, 0, 0])
+ pred_agent.nusc_box.render(
+ axis=self.axes, view=self.view, colors=(c, c, c))
+
+ def render_pred_traj(self, agent_prediction_list, top_k=3):
+ for pred_agent in agent_prediction_list:
+ if pred_agent.is_sdc:
+ continue
+ sorted_ind = np.argsort(pred_agent.pred_traj_score)[
+ ::-1] # from high to low
+ num_modes = len(sorted_ind)
+ sorted_traj = pred_agent.pred_traj[sorted_ind, :, :2]
+ sorted_score = pred_agent.pred_traj_score[sorted_ind]
+ # norm_score = np.sum(np.exp(sorted_score))
+ norm_score = np.exp(sorted_score[0])
+
+ sorted_traj = np.concatenate(
+ [np.zeros((num_modes, 1, 2)), sorted_traj], axis=1)
+ trans = pred_agent.pred_center
+ rot = Quaternion(axis=np.array([0, 0.0, 1.0]), angle=np.pi/2)
+ vehicle_id_list = [0, 1, 2, 3, 4, 6, 7]
+ if pred_agent.pred_label in vehicle_id_list:
+ dot_size = 150
+ else:
+ dot_size = 25
+ # print(sorted_score)
+ for i in range(top_k-1, -1, -1):
+ viz_traj = sorted_traj[i, :, :2]
+ viz_traj = convert_local_coords_to_global(viz_traj, trans, rot)
+ traj_score = np.exp(sorted_score[i])/norm_score
+ # traj_score = [1.0, 0.01, 0.01, 0.01, 0.01, 0.01][i]
+ self._render_traj(viz_traj, traj_score=traj_score,
+ colormap='winter', dot_size=dot_size)
+
+ def render_pred_map_data(self, predicted_map_seg):
+ # rendered_map = map_color_dict
+ # divider, crossing, contour
+ map_color_dict = np.array(
+ [(204, 128, 0), (102, 255, 102), (102, 255, 102)])
+ rendered_map = map_color_dict[predicted_map_seg.argmax(
+ -1).reshape(-1)].reshape(200, 200, -1)
+ bg_mask = predicted_map_seg.sum(-1) == 0
+ rendered_map[bg_mask, :] = 255
+ self.axes.imshow(rendered_map, alpha=0.6,
+ interpolation='nearest', extent=(-51.2, 51.2, -51.2, 51.2))
+
+ def render_occ_map_data(self, agent_list):
+ rendered_map = np.ones((200, 200, 3))
+ rendered_map_hsv = matplotlib.colors.rgb_to_hsv(rendered_map)
+ occ_prob_map = np.zeros((200, 200))
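+        # Render in HSV: each agent's hue encodes its track identity and the
+        # saturation encodes occupancy confidence. occ_prob_map keeps the
+        # highest probability seen per cell so stronger predictions are not
+        # overwritten by weaker overlapping ones.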
+ for i in range(len(agent_list)):
+ pred_agent = agent_list[i]
+ if pred_agent.pred_occ_map is None:
+ continue
+            if hasattr(pred_agent, 'pred_track_id') and pred_agent.pred_track_id is not None:  # tracked agents carry a track id
+ tr_id = pred_agent.pred_track_id
+ c = color_mapping[tr_id % len(color_mapping)]
+ pred_occ_map = pred_agent.pred_occ_map.max(0)
+ update_mask = pred_occ_map > occ_prob_map
+ occ_prob_map[update_mask] = pred_occ_map[update_mask]
+ pred_occ_map *= update_mask
+ hsv_c = matplotlib.colors.rgb_to_hsv(c)
+ rendered_map_hsv[pred_occ_map > 0.1] = (
+ np.ones((200, 200, 1)) * hsv_c)[pred_occ_map > 0.1]
+ max_prob = pred_occ_map.max()
+ renorm_pred_occ_map = (pred_occ_map - max_prob) * 0.7 + 1
+ sat_map = (renorm_pred_occ_map * hsv_c[1])
+ rendered_map_hsv[pred_occ_map > 0.1,
+ 1] = sat_map[pred_occ_map > 0.1]
+ rendered_map = matplotlib.colors.hsv_to_rgb(rendered_map_hsv)
+ self.axes.imshow(rendered_map, alpha=0.8,
+ interpolation='nearest', extent=(-50, 50, -50, 50))
+
+ def render_occ_map_data_time(self, agent_list, t):
+ rendered_map = np.ones((200, 200, 3))
+ rendered_map_hsv = matplotlib.colors.rgb_to_hsv(rendered_map)
+ occ_prob_map = np.zeros((200, 200))
+ for i in range(len(agent_list)):
+ pred_agent = agent_list[i]
+ if pred_agent.pred_occ_map is None:
+ continue
+            if hasattr(pred_agent, 'pred_track_id') and pred_agent.pred_track_id is not None:  # tracked agents carry a track id
+ tr_id = pred_agent.pred_track_id
+ c = color_mapping[tr_id % len(color_mapping)]
+ pred_occ_map = pred_agent.pred_occ_map[t]
+ update_mask = pred_occ_map > occ_prob_map
+ occ_prob_map[update_mask] = pred_occ_map[update_mask]
+ pred_occ_map *= update_mask
+ hsv_c = matplotlib.colors.rgb_to_hsv(c)
+ rendered_map_hsv[pred_occ_map > 0.1] = (
+ np.ones((200, 200, 1)) * hsv_c)[pred_occ_map > 0.1]
+ max_prob = pred_occ_map.max()
+ renorm_pred_occ_map = (pred_occ_map - max_prob) * 0.7 + 1
+ sat_map = (renorm_pred_occ_map * hsv_c[1])
+ rendered_map_hsv[pred_occ_map > 0.1,
+ 1] = sat_map[pred_occ_map > 0.1]
+ rendered_map = matplotlib.colors.hsv_to_rgb(rendered_map_hsv)
+ self.axes.imshow(rendered_map, alpha=0.8,
+ interpolation='nearest', extent=(-50, 50, -50, 50))
+
+ def render_planning_data(self, predicted_planning, show_command=False):
+ planning_traj = predicted_planning.pred_traj
+ planning_traj = np.concatenate(
+ [np.zeros((1, 2)), planning_traj], axis=0)
+ self._render_traj(planning_traj, colormap='autumn', dot_size=50)
+ if show_command:
+ self._render_command(predicted_planning.command)
+
+ def render_planning_attn_mask(self, predicted_planning):
+ planning_attn_mask = predicted_planning.attn_mask
+ planning_attn_mask = planning_attn_mask/planning_attn_mask.max()
+ cmap_name = 'plasma'
+ self.axes.imshow(planning_attn_mask, alpha=0.8, interpolation='nearest', extent=(
+ -51.2, 51.2, -51.2, 51.2), vmax=0.2, cmap=matplotlib.colormaps[cmap_name])
+
+ def render_hd_map(self, nusc, nusc_maps, sample_token):
+ sample_record = nusc.get('sample', sample_token)
+ sd_rec = nusc.get('sample_data', sample_record['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ info = {
+ 'lidar2ego_translation': cs_record['translation'],
+ 'lidar2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'scene_token': sample_record['scene_token']
+ }
+
+ layer_names = ['road_divider', 'road_segment', 'lane_divider',
+ 'lane', 'road_divider', 'traffic_light', 'ped_crossing']
+ map_mask = obtain_map_info(nusc,
+ nusc_maps,
+ info,
+ patch_size=(102.4, 102.4),
+ canvas_size=(1024, 1024),
+ layer_names=layer_names)
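+        # Re-orient the rasterized map so it lines up with the BEV axes used
+        # by this renderer (flip, rotate 90 degrees, mirror), then binarize.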
+ map_mask = np.flip(map_mask, axis=1)
+ map_mask = np.rot90(map_mask, k=-1, axes=(1, 2))
+ map_mask = map_mask[:, ::-1] > 0
+ map_show = np.ones((1024, 1024, 3))
+ map_show[map_mask[0], :] = np.array([1.00, 0.50, 0.31])
+ map_show[map_mask[1], :] = np.array([159./255., 0.0, 1.0])
+ self.axes.imshow(map_show, alpha=0.2, interpolation='nearest',
+ extent=(-51.2, 51.2, -51.2, 51.2))
+
+ def _render_traj(self, future_traj, traj_score=1, colormap='winter', points_per_step=20, line_color=None, dot_color=None, dot_size=25):
+ total_steps = (len(future_traj)-1) * points_per_step + 1
+ dot_colors = matplotlib.colormaps[colormap](
+ np.linspace(0, 1, total_steps))[:, :3]
+ dot_colors = dot_colors*traj_score + \
+ (1-traj_score)*np.ones_like(dot_colors)
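+        # Linearly interpolate points_per_step dots between consecutive
+        # waypoints so the trajectory renders as a dense dotted curve.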
+ total_xy = np.zeros((total_steps, 2))
+ for i in range(total_steps-1):
+ unit_vec = future_traj[i//points_per_step +
+ 1] - future_traj[i//points_per_step]
+ total_xy[i] = (i/points_per_step - i//points_per_step) * \
+ unit_vec + future_traj[i//points_per_step]
+ total_xy[-1] = future_traj[-1]
+ self.axes.scatter(
+ total_xy[:, 0], total_xy[:, 1], c=dot_colors, s=dot_size)
+
+ def _render_command(self, command):
+ command_dict = ['TURN RIGHT', 'TURN LEFT', 'KEEP FORWARD']
+ self.axes.text(-48, -45, command_dict[int(command)], fontsize=45)
+
+ def render_sdc_car(self):
+ sdc_car_png = cv2.imread('sources/sdc_car.png')
+ sdc_car_png = cv2.cvtColor(sdc_car_png, cv2.COLOR_BGR2RGB)
+ self.axes.imshow(sdc_car_png, extent=(-1, 1, -2, 2))
+
+ def render_legend(self):
+ legend = cv2.imread('sources/legend.png')
+ legend = cv2.cvtColor(legend, cv2.COLOR_BGR2RGB)
+ self.axes.imshow(legend, extent=(23, 51.2, -50, -40))
diff --git a/adzoo/uniad/analysis_tools/visualize/render/cam_render.py b/adzoo/uniad/analysis_tools/visualize/render/cam_render.py
new file mode 100644
index 0000000..c2646b1
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/visualize/render/cam_render.py
@@ -0,0 +1,202 @@
+import cv2
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+from nuscenes.utils.data_classes import LidarPointCloud, Box
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from tools.analysis_tools.visualize.utils import color_mapping, AgentPredictionData
+from tools.analysis_tools.visualize.render.base_render import BaseRender
+from pyquaternion import Quaternion
+
+# Define a constant for camera names
+CAM_NAMES = [
+ 'CAM_FRONT_LEFT',
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+]
+
+
+class CameraRender(BaseRender):
+ """
+ Render class for Camera View
+ """
+
+ def __init__(self,
+ figsize=(53.3333, 20),
+ show_gt_boxes=False):
+ super().__init__(figsize)
+ self.cams = CAM_NAMES
+ self.show_gt_boxes = show_gt_boxes
+
+ def get_axis(self, index):
+ """Retrieve the corresponding axis based on the index."""
+ return self.axes[index//3, index % 3]
+
+ def project_to_cam(self,
+ agent_prediction_list,
+ sample_data_token,
+ nusc,
+ lidar_cs_record,
+ project_traj=False,
+ cam=None,
+ ):
+ """Project predictions to camera view."""
+ _, cs_record, pose_record, cam_intrinsic, imsize = self.get_image_info(
+ sample_data_token, nusc)
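+        # Predictions live in the LiDAR frame: each box is first lifted into
+        # the ego frame via the LiDAR calibrated_sensor record, then (below)
+        # moved into the camera frame via the camera's calibrated_sensor
+        # record.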
+ boxes = []
+ for agent in agent_prediction_list:
+ box = Box(agent.pred_center, agent.pred_dim, Quaternion(axis=(0.0, 0.0, 1.0), radians=agent.pred_yaw),
+ name=agent.pred_label, token='predicted')
+ box.is_sdc = agent.is_sdc
+ if project_traj:
+ box.pred_traj = np.zeros((agent.pred_traj_max.shape[0]+1, 3))
+ box.pred_traj[:, 0] = agent.pred_center[0]
+ box.pred_traj[:, 1] = agent.pred_center[1]
+ box.pred_traj[:, 2] = agent.pred_center[2] - \
+ agent.pred_dim[2]/2
+ box.pred_traj[1:, :2] += agent.pred_traj_max[:, :2]
+ box.pred_traj = (Quaternion(
+ lidar_cs_record['rotation']).rotation_matrix @ box.pred_traj.T).T
+ box.pred_traj += np.array(
+ lidar_cs_record['translation'])[None, :]
+ box.rotate(Quaternion(lidar_cs_record['rotation']))
+ box.translate(np.array(lidar_cs_record['translation']))
+ boxes.append(box)
+ # Make list of Box objects including coord system transforms.
+
+ box_list = []
+ tr_id_list = []
+ for i, box in enumerate(boxes):
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+ if project_traj:
+ box.pred_traj += -np.array(cs_record['translation'])[None, :]
+ box.pred_traj = (Quaternion(
+ cs_record['rotation']).inverse.rotation_matrix @ box.pred_traj.T).T
+
+ tr_id = agent_prediction_list[i].pred_track_id
+            if box.is_sdc and cam == 'CAM_FRONT':
+                # always keep the ego (SDC) box for the front camera; append a
+                # matching track id so box_list and tr_id_list stay aligned
+                box_list.append(box)
+                tr_id_list.append(tr_id)
+                continue
+ if not box_in_image(box, cam_intrinsic, imsize):
+ continue
+ box_list.append(box)
+ tr_id_list.append(tr_id)
+ return box_list, tr_id_list, cam_intrinsic, imsize
+
+ def render_image_data(self, sample_token, nusc):
+ """Load and annotate image based on the provided path."""
+ sample = nusc.get('sample', sample_token)
+ for i, cam in enumerate(self.cams):
+ sample_data_token = sample['data'][cam]
+ data_path, _, _, _, _ = self.get_image_info(
+ sample_data_token, nusc)
+ image = self.load_image(data_path, cam)
+ self.update_image(image, i, cam)
+
+ def load_image(self, data_path, cam):
+ """Update the axis of the plot with the provided image."""
+ image = np.array(Image.open(data_path))
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ org = (50, 60)
+ fontScale = 2
+ color = (0, 0, 0)
+ thickness = 4
+ return cv2.putText(image, cam, org, font, fontScale, color, thickness, cv2.LINE_AA)
+
+ def update_image(self, image, index, cam):
+ """Render image data for each camera."""
+ ax = self.get_axis(index)
+ ax.imshow(image)
+ plt.axis('off')
+ ax.axis('off')
+ ax.grid(False)
+
+ def render_pred_track_bbox(self, predicted_agent_list, sample_token, nusc):
+ """Render bounding box for predicted tracks."""
+ sample = nusc.get('sample', sample_token)
+ lidar_cs_record = nusc.get('calibrated_sensor', nusc.get(
+ 'sample_data', sample['data']['LIDAR_TOP'])['calibrated_sensor_token'])
+ for i, cam in enumerate(self.cams):
+ sample_data_token = sample['data'][cam]
+ box_list, tr_id_list, camera_intrinsic, imsize = self.project_to_cam(
+ predicted_agent_list, sample_data_token, nusc, lidar_cs_record)
+ for j, box in enumerate(box_list):
+ if box.is_sdc:
+ continue
+ tr_id = tr_id_list[j]
+ if tr_id is None:
+ tr_id = 0
+ c = color_mapping[tr_id % len(color_mapping)]
+ box.render(
+ self.axes[i//3, i % 3], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+ # plot gt
+ if self.show_gt_boxes:
+ data_path, boxes, camera_intrinsic = nusc.get_sample_data(
+ sample_data_token, selected_anntokens=sample['anns'])
+ for j, box in enumerate(boxes):
+ c = [0, 1, 0]
+ box.render(
+ self.axes[i//3, i % 3], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+ self.axes[i//3, i % 3].set_xlim(0, imsize[0])
+ self.axes[i//3, i % 3].set_ylim(imsize[1], 0)
+
+ def render_pred_traj(self, predicted_agent_list, sample_token, nusc, render_sdc=False, points_per_step=10):
+ """Render predicted trajectories."""
+ sample = nusc.get('sample', sample_token)
+ lidar_cs_record = nusc.get('calibrated_sensor', nusc.get(
+ 'sample_data', sample['data']['LIDAR_TOP'])['calibrated_sensor_token'])
+ for i, cam in enumerate(self.cams):
+ sample_data_token = sample['data'][cam]
+ box_list, tr_id_list, camera_intrinsic, imsize = self.project_to_cam(
+ predicted_agent_list, sample_data_token, nusc, lidar_cs_record, project_traj=True, cam=cam)
+ for j, box in enumerate(box_list):
+ traj_points = box.pred_traj[:, :3]
+
+ total_steps = (len(traj_points)-1) * points_per_step + 1
+ total_xy = np.zeros((total_steps, 3))
+ for k in range(total_steps-1):
+ unit_vec = traj_points[k//points_per_step +
+ 1] - traj_points[k//points_per_step]
+ total_xy[k] = (k/points_per_step - k//points_per_step) * \
+ unit_vec + traj_points[k//points_per_step]
+ in_range_mask = total_xy[:, 2] > 0.1
+ traj_points = view_points(
+ total_xy.T, camera_intrinsic, normalize=True)[:2, :]
+ traj_points = traj_points[:2, in_range_mask]
+ if box.is_sdc:
+ if render_sdc:
+ self.axes[i//3, i % 3].scatter(
+ traj_points[0], traj_points[1], color=(1, 0.5, 0), s=150)
+ else:
+ continue
+ else:
+ tr_id = tr_id_list[j]
+ if tr_id is None:
+ tr_id = 0
+ c = color_mapping[tr_id % len(color_mapping)]
+ self.axes[i//3, i %
+ 3].scatter(traj_points[0], traj_points[1], color=c, s=15)
+ self.axes[i//3, i % 3].set_xlim(0, imsize[0])
+ self.axes[i//3, i % 3].set_ylim(imsize[1], 0)
+
+ def get_image_info(self, sample_data_token, nusc):
+ """Retrieve image information."""
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor',
+ sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+ data_path = nusc.get_sample_data_path(sample_data_token)
+
+ if sensor_record['modality'] == 'camera':
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ else:
+ cam_intrinsic = None
+ imsize = None
+ return data_path, cs_record, pose_record, cam_intrinsic, imsize
diff --git a/adzoo/uniad/analysis_tools/visualize/run.py b/adzoo/uniad/analysis_tools/visualize/run.py
new file mode 100644
index 0000000..b64b545
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/visualize/run.py
@@ -0,0 +1,338 @@
+import cv2
+import torch
+import argparse
+import os
+import glob
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+from nuscenes import NuScenes
+from nuscenes.prediction import PredictHelper, convert_local_coords_to_global
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from nuscenes.utils.data_classes import LidarPointCloud, Box
+from nuscenes.utils import splits
+from pyquaternion import Quaternion
+from mmcv.datasets.nuscenes_e2e_dataset import obtain_map_info
+from mmcv.datasets.eval_utils.map_api import NuScenesMap
+from mmcv.fileio.io import load
+from PIL import Image
+from tools.analysis_tools.visualize.utils import color_mapping, AgentPredictionData
+from tools.analysis_tools.visualize.render.bev_render import BEVRender
+from tools.analysis_tools.visualize.render.cam_render import CameraRender
+
+
+class Visualizer:
+ """
+    Visualizer that composes BEV and camera renderings of the predictions
+ """
+
+ def __init__(
+ self,
+ dataroot='/mnt/petrelfs/yangjiazhi/e2e_proj/data/nus_mini',
+ version='v1.0-mini',
+ predroot=None,
+ with_occ_map=False,
+ with_map=False,
+ with_planning=False,
+ with_pred_box=True,
+ with_pred_traj=False,
+ show_gt_boxes=False,
+ show_lidar=False,
+ show_command=False,
+ show_hd_map=False,
+ show_sdc_car=False,
+ show_sdc_traj=False,
+ show_legend=False):
+ self.nusc = NuScenes(version=version, dataroot=dataroot, verbose=True)
+ self.predict_helper = PredictHelper(self.nusc)
+ self.with_occ_map = with_occ_map
+ self.with_map = with_map
+ self.with_planning = with_planning
+ self.show_lidar = show_lidar
+ self.show_command = show_command
+ self.show_hd_map = show_hd_map
+ self.show_sdc_car = show_sdc_car
+ self.show_sdc_traj = show_sdc_traj
+ self.show_legend = show_legend
+ self.with_pred_traj = with_pred_traj
+ self.with_pred_box = with_pred_box
+ self.veh_id_list = [0, 1, 2, 3, 4, 6, 7]
+ self.use_json = '.json' in predroot
+ self.token_set = set()
+ self.predictions = self._parse_predictions_multitask_pkl(predroot)
+ self.bev_render = BEVRender(show_gt_boxes=show_gt_boxes)
+ self.cam_render = CameraRender(show_gt_boxes=show_gt_boxes)
+
+ if self.show_hd_map:
+ self.nusc_maps = {
+ 'boston-seaport': NuScenesMap(dataroot=dataroot, map_name='boston-seaport'),
+ 'singapore-hollandvillage': NuScenesMap(dataroot=dataroot, map_name='singapore-hollandvillage'),
+ 'singapore-onenorth': NuScenesMap(dataroot=dataroot, map_name='singapore-onenorth'),
+ 'singapore-queenstown': NuScenesMap(dataroot=dataroot, map_name='singapore-queenstown'),
+ }
+
+ def _parse_predictions_multitask_pkl(self, predroot):
+
+ outputs = load(predroot)
+ outputs = outputs['bbox_results']
+ prediction_dict = dict()
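+        # prediction_dict maps each sample token to the per-frame agents,
+        # BEV map segmentation, and planning output parsed below.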
+ for k in range(len(outputs)):
+ token = outputs[k]['token']
+ self.token_set.add(token)
+ if self.show_sdc_traj:
+ outputs[k]['boxes_3d'].tensor = torch.cat(
+ [outputs[k]['boxes_3d'].tensor, outputs[k]['sdc_boxes_3d'].tensor], dim=0)
+ outputs[k]['scores_3d'] = torch.cat(
+ [outputs[k]['scores_3d'], outputs[k]['sdc_scores_3d']], dim=0)
+ outputs[k]['labels_3d'] = torch.cat([outputs[k]['labels_3d'], torch.zeros(
+ (1,), device=outputs[k]['labels_3d'].device)], dim=0)
+ # detection
+ bboxes = outputs[k]['boxes_3d']
+ scores = outputs[k]['scores_3d']
+ labels = outputs[k]['labels_3d']
+
+ track_scores = scores.cpu().detach().numpy()
+ track_labels = labels.cpu().detach().numpy()
+ track_boxes = bboxes.tensor.cpu().detach().numpy()
+
+ track_centers = bboxes.gravity_center.cpu().detach().numpy()
+ track_dims = bboxes.dims.cpu().detach().numpy()
+ track_yaw = bboxes.yaw.cpu().detach().numpy()
+
+ if 'track_ids' in outputs[k]:
+ track_ids = outputs[k]['track_ids'].cpu().detach().numpy()
+ else:
+ track_ids = None
+
+ # speed
+ track_velocity = bboxes.tensor.cpu().detach().numpy()[:, -2:]
+
+ # trajectories
+            trajs = outputs[k]['traj'].numpy()
+            traj_scores = outputs[k]['traj_scores'].numpy()
+
+ predicted_agent_list = []
+
+ # occflow
+ if self.with_occ_map:
+ if 'topk_query_ins_segs' in outputs[k]['occ']:
+ occ_map = outputs[k]['occ']['topk_query_ins_segs'][0].cpu(
+ ).numpy()
+ else:
+ occ_map = np.zeros((1, 5, 200, 200))
+ else:
+ occ_map = None
+
+ occ_idx = 0
+ for i in range(track_scores.shape[0]):
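+                # Skip low-confidence tracks (score < 0.25) to reduce clutter
+                # in the visualization.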
+ if track_scores[i] < 0.25:
+ continue
+ if occ_map is not None and track_labels[i] in self.veh_id_list:
+ occ_map_cur = occ_map[occ_idx, :, ::-1]
+ occ_idx += 1
+ else:
+ occ_map_cur = None
+ if track_ids is not None:
+ if i < len(track_ids):
+ track_id = track_ids[i]
+ else:
+ track_id = 0
+ else:
+ track_id = None
+ # if track_labels[i] not in [0, 1, 2, 3, 4, 6, 7]:
+ # continue
+ predicted_agent_list.append(
+ AgentPredictionData(
+ track_scores[i],
+ track_labels[i],
+ track_centers[i],
+ track_dims[i],
+ track_yaw[i],
+ track_velocity[i],
+ trajs[i],
+ traj_scores[i],
+ pred_track_id=track_id,
+ pred_occ_map=occ_map_cur,
+ past_pred_traj=None
+ )
+ )
+
+ if self.with_map:
+ map_thres = 0.7
+ score_list = outputs[k]['pts_bbox']['score_list'].cpu().numpy().transpose([
+ 1, 2, 0])
+ predicted_map_seg = outputs[k]['pts_bbox']['lane_score'].cpu().numpy().transpose([
+ 1, 2, 0]) # H, W, C
+ predicted_map_seg[..., -1] = score_list[..., -1]
+ predicted_map_seg = (predicted_map_seg > map_thres) * 1.0
+ predicted_map_seg = predicted_map_seg[::-1, :, :]
+ else:
+ predicted_map_seg = None
+
+ if self.with_planning:
+ # detection
+ bboxes = outputs[k]['sdc_boxes_3d']
+ scores = outputs[k]['sdc_scores_3d']
+ labels = 0
+
+ track_scores = scores.cpu().detach().numpy()
+ track_labels = labels
+ track_boxes = bboxes.tensor.cpu().detach().numpy()
+
+ track_centers = bboxes.gravity_center.cpu().detach().numpy()
+ track_dims = bboxes.dims.cpu().detach().numpy()
+ track_yaw = bboxes.yaw.cpu().detach().numpy()
+ track_velocity = bboxes.tensor.cpu().detach().numpy()[:, -2:]
+
+ if self.show_command:
+ command = outputs[k]['command'][0].cpu().detach().numpy()
+ else:
+ command = None
+ planning_agent = AgentPredictionData(
+ track_scores[0],
+ track_labels,
+ track_centers[0],
+ track_dims[0],
+ track_yaw[0],
+ track_velocity[0],
+ outputs[k]['planning_traj'][0].cpu().detach().numpy(),
+ 1,
+ pred_track_id=-1,
+ pred_occ_map=None,
+ past_pred_traj=None,
+ is_sdc=True,
+ command=command,
+ )
+ predicted_agent_list.append(planning_agent)
+ else:
+ planning_agent = None
+ prediction_dict[token] = dict(predicted_agent_list=predicted_agent_list,
+ predicted_map_seg=predicted_map_seg,
+ predicted_planning=planning_agent)
+ return prediction_dict
+
+ def visualize_bev(self, sample_token, out_filename, t=None):
+ self.bev_render.reset_canvas(dx=1, dy=1)
+ self.bev_render.set_plot_cfg()
+
+ if self.show_lidar:
+ self.bev_render.show_lidar_data(sample_token, self.nusc)
+ if self.bev_render.show_gt_boxes:
+ self.bev_render.render_anno_data(
+ sample_token, self.nusc, self.predict_helper)
+ if self.with_pred_box:
+ self.bev_render.render_pred_box_data(
+ self.predictions[sample_token]['predicted_agent_list'])
+ if self.with_pred_traj:
+ self.bev_render.render_pred_traj(
+ self.predictions[sample_token]['predicted_agent_list'])
+ if self.with_map:
+ self.bev_render.render_pred_map_data(
+ self.predictions[sample_token]['predicted_map_seg'])
+ if self.with_occ_map:
+ self.bev_render.render_occ_map_data(
+ self.predictions[sample_token]['predicted_agent_list'])
+ if self.with_planning:
+ self.bev_render.render_pred_box_data(
+ [self.predictions[sample_token]['predicted_planning']])
+ self.bev_render.render_planning_data(
+ self.predictions[sample_token]['predicted_planning'], show_command=self.show_command)
+ if self.show_hd_map:
+ self.bev_render.render_hd_map(
+ self.nusc, self.nusc_maps, sample_token)
+ if self.show_sdc_car:
+ self.bev_render.render_sdc_car()
+ if self.show_legend:
+ self.bev_render.render_legend()
+ self.bev_render.save_fig(out_filename + '.jpg')
+
+ def visualize_cam(self, sample_token, out_filename):
+ self.cam_render.reset_canvas(dx=2, dy=3, tight_layout=True)
+ self.cam_render.render_image_data(sample_token, self.nusc)
+ self.cam_render.render_pred_track_bbox(
+ self.predictions[sample_token]['predicted_agent_list'], sample_token, self.nusc)
+ self.cam_render.render_pred_traj(
+ self.predictions[sample_token]['predicted_agent_list'], sample_token, self.nusc, render_sdc=self.with_planning)
+ self.cam_render.save_fig(out_filename + '_cam.jpg')
+
+ def combine(self, out_filename):
+ bev_image = cv2.imread(out_filename + '.jpg')
+ cam_image = cv2.imread(out_filename + '_cam.jpg')
+ merge_image = cv2.hconcat([cam_image, bev_image])
+ cv2.imwrite(out_filename + '.jpg', merge_image)
+ os.remove(out_filename + '_cam.jpg')
+
+ def to_video(self, folder_path, out_path, fps=4, downsample=1):
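+        # Stitch the per-frame JPEGs (sorted by filename) into a demo video,
+        # optionally downsampling each frame before writing.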
+ imgs_path = glob.glob(os.path.join(folder_path, '*.jpg'))
+ imgs_path = sorted(imgs_path)
+ img_array = []
+ for img_path in imgs_path:
+ img = cv2.imread(img_path)
+ height, width, channel = img.shape
+ img = cv2.resize(img, (width//downsample, height //
+ downsample), interpolation=cv2.INTER_AREA)
+ height, width, channel = img.shape
+ size = (width, height)
+ img_array.append(img)
+ out = cv2.VideoWriter(
+ out_path, cv2.VideoWriter_fourcc(*'DIVX'), fps, size)
+ for i in range(len(img_array)):
+ out.write(img_array[i])
+ out.release()
+
+def main(args):
+ render_cfg = dict(
+ with_occ_map=False,
+ with_map=False,
+ with_planning=True,
+ with_pred_box=True,
+ with_pred_traj=True,
+ show_gt_boxes=False,
+ show_lidar=False,
+ show_command=True,
+ show_hd_map=False,
+ show_sdc_car=True,
+ show_legend=True,
+ show_sdc_traj=False
+ )
+
+ viser = Visualizer(version='v1.0-mini', predroot=args.predroot, dataroot='data/nuscenes', **render_cfg)
+
+ if not os.path.exists(args.out_folder):
+ os.makedirs(args.out_folder)
+
+ val_splits = splits.val
+
+ scene_token_to_name = dict()
+ for i in range(len(viser.nusc.scene)):
+ scene_token_to_name[viser.nusc.scene[i]['token']] = viser.nusc.scene[i]['name']
+
+ for i in range(len(viser.nusc.sample)):
+ sample_token = viser.nusc.sample[i]['token']
+ scene_token = viser.nusc.sample[i]['scene_token']
+
+ if scene_token_to_name[scene_token] not in val_splits:
+ continue
+
+ if sample_token not in viser.token_set:
+ print(i, sample_token, 'not in prediction pkl!')
+ continue
+
+ viser.visualize_bev(sample_token, os.path.join(args.out_folder, str(i).zfill(3)))
+
+ if args.project_to_cam:
+ viser.visualize_cam(sample_token, os.path.join(args.out_folder, str(i).zfill(3)))
+ viser.combine(os.path.join(args.out_folder, str(i).zfill(3)))
+
+ viser.to_video(args.out_folder, args.demo_video, fps=4, downsample=2)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--predroot', default='/mnt/nas20/yihan01.hu/tmp/results.pkl', help='Path to results.pkl')
+ parser.add_argument('--out_folder', default='/mnt/nas20/yihan01.hu/tmp/viz/demo_test/', help='Output folder path')
+ parser.add_argument('--demo_video', default='mini_val_final.avi', help='Demo video name')
+ parser.add_argument('--project_to_cam', default=True, help='Project to cam (default: True)')
+ args = parser.parse_args()
+ main(args)
diff --git a/adzoo/uniad/analysis_tools/visualize/utils.py b/adzoo/uniad/analysis_tools/visualize/utils.py
new file mode 100644
index 0000000..315344e
--- /dev/null
+++ b/adzoo/uniad/analysis_tools/visualize/utils.py
@@ -0,0 +1,131 @@
+import numpy as np
+from nuscenes.utils.data_classes import LidarPointCloud, Box
+from pyquaternion import Quaternion
+
+
+color_mapping = np.asarray([
+ [0, 0, 0],
+ [255, 179, 0],
+ [128, 62, 117],
+ [255, 104, 0],
+ [166, 189, 215],
+ [193, 0, 32],
+ [206, 162, 98],
+ [129, 112, 102],
+ [0, 125, 52],
+ [246, 118, 142],
+ [0, 83, 138],
+ [255, 122, 92],
+ [83, 55, 122],
+ [255, 142, 0],
+ [179, 40, 81],
+ [244, 200, 0],
+ [127, 24, 13],
+ [147, 170, 0],
+ [89, 51, 21],
+ [241, 58, 19],
+ [35, 44, 22],
+ [112, 224, 255],
+ [70, 184, 160],
+ [153, 0, 255],
+ [71, 255, 0],
+ [255, 0, 163],
+ [255, 204, 0],
+ [0, 255, 235],
+ [255, 0, 235],
+ [255, 0, 122],
+ [255, 245, 0],
+ [10, 190, 212],
+ [214, 255, 0],
+ [0, 204, 255],
+ [20, 0, 255],
+ [255, 255, 0],
+ [0, 153, 255],
+ [0, 255, 204],
+ [41, 255, 0],
+ [173, 0, 255],
+ [0, 245, 255],
+ [71, 0, 255],
+ [0, 255, 184],
+ [0, 92, 255],
+ [184, 255, 0],
+ [255, 214, 0],
+ [25, 194, 194],
+ [92, 0, 255],
+ [220, 220, 220],
+ [255, 9, 92],
+ [112, 9, 255],
+ [8, 255, 214],
+ [255, 184, 6],
+ [10, 255, 71],
+ [255, 41, 10],
+ [7, 255, 255],
+ [224, 255, 8],
+ [102, 8, 255],
+ [255, 61, 6],
+ [255, 194, 7],
+ [0, 255, 20],
+ [255, 8, 41],
+ [255, 5, 153],
+ [6, 51, 255],
+ [235, 12, 255],
+ [160, 150, 20],
+ [0, 163, 255],
+ [140, 140, 140],
+ [250, 10, 15],
+ [20, 255, 0],
+])/255
+
+
+class AgentPredictionData:
+ """
+ Agent data class, includes bbox, traj, and occflow
+ """
+
+ def __init__(self,
+ pred_score,
+ pred_label,
+ pred_center,
+ pred_dim,
+ pred_yaw,
+ pred_vel,
+ pred_traj,
+ pred_traj_score,
+ pred_track_id=None,
+ pred_occ_map=None,
+ is_sdc=False,
+ past_pred_traj=None,
+ command=None,
+ attn_mask=None,
+ ):
+ self.pred_score = pred_score
+ self.pred_label = pred_label
+ self.pred_center = pred_center
+ self.pred_dim = pred_dim
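+        # Flip and offset the predicted yaw by pi/2 to match the nuScenes Box
+        # orientation convention used for rendering below.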
+ self.pred_yaw = -pred_yaw-np.pi/2
+ self.pred_vel = pred_vel
+ self.pred_traj = pred_traj
+ self.pred_traj_score = pred_traj_score
+ self.pred_track_id = pred_track_id
+ self.pred_occ_map = pred_occ_map
+ if self.pred_traj is not None:
+ if isinstance(self.pred_traj_score, int):
+ self.pred_traj_max = self.pred_traj
+ else:
+ self.pred_traj_max = self.pred_traj[self.pred_traj_score.argmax(
+ )]
+ else:
+ self.pred_traj_max = None
+ self.nusc_box = Box(
+ center=pred_center,
+ size=pred_dim,
+ orientation=Quaternion(axis=[0, 0, 1], radians=self.pred_yaw),
+ label=pred_label,
+ score=pred_score
+ )
+ if is_sdc:
+ self.pred_center = [0, 0, -1.2+1.56/2]
+ self.is_sdc = is_sdc
+ self.past_pred_traj = past_pred_traj
+ self.command = command
+ self.attn_mask = attn_mask
diff --git a/adzoo/uniad/configs/_base_/datasets/nus-3d.py b/adzoo/uniad/configs/_base_/datasets/nus-3d.py
new file mode 100644
index 0000000..1548171
--- /dev/null
+++ b/adzoo/uniad/configs/_base_/datasets/nus-3d.py
@@ -0,0 +1,142 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+# For the nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to 24. Please change the interval accordingly if you do not use the
+# default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/uniad/configs/_base_/default_runtime.py b/adzoo/uniad/configs/_base_/default_runtime.py
new file mode 100644
index 0000000..4e85b69
--- /dev/null
+++ b/adzoo/uniad/configs/_base_/default_runtime.py
@@ -0,0 +1,18 @@
+checkpoint_config = dict(interval=1)
+# yapf:disable push
+# By default we use textlogger hook and tensorboard
+# For more loggers see
+# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = None
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/adzoo/uniad/configs/stage1_track_map/base_track_map.py b/adzoo/uniad/configs/stage1_track_map/base_track_map.py
new file mode 100644
index 0000000..cd18640
--- /dev/null
+++ b/adzoo/uniad/configs/stage1_track_map/base_track_map.py
@@ -0,0 +1,580 @@
+_base_ = ["../_base_/datasets/nus-3d.py",
+ "../_base_/default_runtime.py"]
+
+# Update-2023-06-12:
+# [Enhance] Update some freezing args of UniAD
+# [Bugfix] Reproduce the from-scratch results of stage1
+# 1. Remove loss_past_traj in stage1 training
+# 2. Unfreeze neck and BN
+# --> Reproduced tracking result: AMOTA 0.393
+
+
+# With the neck and BN unfrozen, the from-scratch results of stage1 can be reproduced
+plugin = True
+# plugin_dir = "projects/mmdet3d_plugin/"
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+patch_size = [102.4, 102.4]
+img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+ "car",
+ "truck",
+ "construction_vehicle",
+ "bus",
+ "trailer",
+ "barrier",
+ "motorcycle",
+ "bicycle",
+ "pedestrian",
+ "traffic_cone",
+]
+
+input_modality = dict(
+ use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True
+)
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+_feed_dim_ = _ffn_dim_
+_dim_half_ = _pos_dim_
+canvas_size = (bev_h_, bev_w_)
+
+# NOTE: queue_length is reduced from 5 to 3 here to save GPU memory, at the risk of a performance drop.
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+### traj prediction args ###
+predict_steps = 12
+predict_modes = 6
+fut_steps = 4
+past_steps = 4
+use_nonlinear_optimizer = True
+
+## occflow setting
+occ_n_future = 4
+occ_n_future_plan = 6
+occ_n_future_max = max([occ_n_future, occ_n_future_plan])
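+# The occupancy GT horizon must cover both the occ head (occ_n_future frames)
+# and the planning head (occ_n_future_plan frames), hence the max.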
+
+### planning ###
+planning_steps = 6
+use_col_optim = True
+
+### Occ args ###
+occflow_grid_conf = {
+ 'xbound': [-50.0, 50.0, 0.5],
+ 'ybound': [-50.0, 50.0, 0.5],
+ 'zbound': [-10.0, 10.0, 20.0],
+}
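+# A 100 m x 100 m grid at 0.5 m resolution gives a 200 x 200 BEV canvas,
+# matching (bev_h_, bev_w_) above.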
+
+# Other settings
+train_gt_iou_threshold=0.3
+
+model = dict(
+ type="UniAD",
+ gt_iou_threshold=train_gt_iou_threshold,
+ queue_length=queue_length,
+ use_grid_mask=True,
+ video_test_mode=True,
+ num_query=900,
+ num_classes=10,
+ pc_range=point_cloud_range,
+ img_backbone=dict(
+ type="ResNet",
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=4,
+ norm_cfg=dict(type="BN2d", requires_grad=False),
+ norm_eval=True,
+ style="caffe",
+ dcn=dict(
+ type="DCNv2", deform_groups=1, fallback_on_stride=False
+ ), # original DCNv2 will print log when perform load_state_dict
+ stage_with_dcn=(False, False, True, True),
+ ),
+ img_neck=dict(
+ type="FPN",
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs="on_output",
+ num_outs=4,
+ relu_before_extra_convs=True,
+ ),
+ freeze_img_backbone=True,
+ freeze_img_neck=False,
+ freeze_bn=False,
+ score_thresh=0.4,
+ filter_score_thresh=0.35,
+ qim_args=dict(
+ qim_type="QIMBase",
+ merger_dropout=0,
+ update_query_pos=True,
+ fp_ratio=0.3,
+ random_drop=0.1,
+ ), # hyper-param for query dropping mentioned in MOTR
+ mem_args=dict(
+ memory_bank_type="MemoryBank",
+ memory_bank_score_thresh=0.0,
+ memory_bank_len=4,
+ ),
+ loss_cfg=dict(
+ type="ClipMatcher",
+ num_classes=10,
+ weight_dict=None,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ assigner=dict(
+ type="HungarianAssigner3DTrack",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ pc_range=point_cloud_range,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_past_traj_weight=0.0,
+ ), # loss cfg for tracking
+ pts_bbox_head=dict(
+ type="BEVFormerTrackHead",
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ transformer=dict(
+ type="UniADPerceptionTransformer",
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type="BEVFormerEncoder",
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type="BEVFormerLayer",
+ attn_cfgs=[
+ dict(
+ type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1
+ ),
+ dict(
+ type="SpatialCrossAttention",
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type="MSDeformableAttention3D",
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_,
+ ),
+ embed_dims=_dim_,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ decoder=dict(
+ type="DetectionTransformerDecoder",
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type="DetrTransformerDecoderLayer",
+ attn_cfgs=[
+ dict(
+ type="MultiheadAttention",
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1,
+ ),
+ dict(
+ type="CustomMSDeformableAttention",
+ embed_dims=_dim_,
+ num_levels=1,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ ),
+ bbox_coder=dict(
+ type="NMSFreeCoder",
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10,
+ ),
+ positional_encoding=dict(
+ type="LearnedPositionalEncoding",
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_iou=dict(type="GIoULoss", loss_weight=0.0),
+ ),
+ seg_head=dict(
+ type='PansegformerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ canvas_size=canvas_size,
+ pc_range=point_cloud_range,
+ num_query=300,
+ num_classes=4,
+ num_things_classes=3,
+ num_stuff_classes=1,
+ in_channels=2048,
+ sync_cls_avg_factor=True,
+ as_two_stage=False,
+ with_box_refine=True,
+ transformer=dict(
+ type='SegDeformableTransformer',
+ encoder=dict(
+ type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ ),
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+ decoder=dict(
+ type='DeformableDetrTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ )
+ ],
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')
+ ),
+ ),
+ ),
+ positional_encoding=dict(
+ type='SinePositionalEncoding',
+ num_feats=_dim_half_,
+ normalize=True,
+ offset=-0.5),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
+ thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4),
+ stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True),
+ train_cfg=dict(
+ assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ ),
+ assigner_with_mask=dict(
+ type='HungarianAssigner_multi_info',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ mask_cost=dict(type='DiceCost', weight=2.0),
+ ),
+ sampler =dict(type='PseudoSampler'),
+ sampler_with_mask =dict(type='PseudoSampler_segformer'),
+ ),
+ ),
+
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type="HungarianAssigner3D",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ iou_cost=dict(
+ type="IoUCost", weight=0.0
+ ), # Fake cost. This is just to make it compatible with DETR head.
+ pc_range=point_cloud_range,
+ ),
+ )
+ ),
+)
+dataset_type = "NuScenesE2EDataset"
+data_root = "data/nuscenes/"
+info_root = "data/infos/"
+file_client_args = dict(backend="disk")
+ann_file_train = info_root + "nuscenes_infos_temporal_train.pkl"
+ann_file_val = info_root + "nuscenes_infos_temporal_val.pkl"
+ann_file_test = info_root + "nuscenes_infos_temporal_val.pkl"
+
+
+train_pipeline = [
+ dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root),
+ dict(type="PhotoMetricDistortionMultiViewImage"),
+ dict(
+ type="LoadAnnotations3D_E2E",
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+
+ with_future_anns=True, # occ_flow gt
+ with_ins_inds_3d=True, # ins_inds
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+
+ dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ filter_invisible=False), # NOTE: Currently vis_token is not in pkl
+
+ dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range),
+ dict(type="ObjectNameFilterTrack", classes=class_names),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type="DefaultFormatBundle3D", class_names=class_names),
+ dict(
+ type="CustomCollect3D",
+ keys=[
+ "gt_bboxes_3d",
+ "gt_labels_3d",
+ "gt_inds",
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_fut_traj",
+ "gt_fut_traj_mask",
+ "gt_past_traj",
+ "gt_past_traj_mask",
+ "gt_sdc_bbox",
+ "gt_sdc_label",
+ "gt_sdc_fut_traj",
+ "gt_sdc_fut_traj_mask",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # Occ gt
+ # "gt_segmentation",
+ # "gt_instance",
+ # "gt_centerness",
+ # "gt_offset",
+ # "gt_flow",
+ # "gt_backward_flow",
+ # "gt_occ_has_invalid_frame",
+ # "gt_occ_img_is_valid",
+ # # gt future bbox for plan
+ # "gt_future_boxes",
+ # "gt_future_labels",
+ # # planning
+ # "sdc_planning",
+ # "sdc_planning_mask",
+ # "command",
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type='LoadAnnotations3D_E2E',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_attr_label=False,
+
+ with_future_anns=True,
+ with_ins_inds_3d=False,
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+ dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ filter_invisible=False),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ]
+ ),
+ ],
+ ),
+]
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=1,
+ train=dict(
+ type=dataset_type,
+ file_client_args=file_client_args,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ patch_size=patch_size,
+ canvas_size=canvas_size,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ predict_steps=predict_steps,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ occ_receptive_field=3,
+ occ_n_future=occ_n_future_max,
+ occ_filter_invalid_sample=False,
+
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d="LiDAR",
+ ),
+ val=dict(
+ type=dataset_type,
+ file_client_args=file_client_args,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ patch_size=patch_size,
+ canvas_size=canvas_size,
+ bev_size=(bev_h_, bev_w_),
+ predict_steps=predict_steps,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ eval_mod=['det', 'track', 'map'],
+
+ occ_receptive_field=3,
+ occ_n_future=occ_n_future_max,
+ occ_filter_invalid_sample=False,
+ ),
+ test=dict(
+ type=dataset_type,
+ file_client_args=file_client_args,
+ data_root=data_root,
+ test_mode=True,
+ ann_file=ann_file_test,
+ pipeline=test_pipeline,
+ patch_size=patch_size,
+ canvas_size=canvas_size,
+ bev_size=(bev_h_, bev_w_),
+ predict_steps=predict_steps,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ occ_n_future=occ_n_future_max,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ classes=class_names,
+ modality=input_modality,
+ eval_mod=['det', 'map', 'track'],
+ ),
+ shuffler_sampler=dict(type="DistributedGroupSampler"),
+ nonshuffler_sampler=dict(type="DistributedSampler"),
+)
+optimizer = dict(
+ type="AdamW",
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ "img_backbone": dict(lr_mult=0.1),
+ }
+ ),
+ weight_decay=0.01,
+)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy="CosineAnnealing",
+ warmup="linear",
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3,
+)
+total_epochs = 6
+evaluation = dict(interval=6, pipeline=test_pipeline)
+runner = dict(type="EpochBasedRunner", max_epochs=total_epochs)
+log_config = dict(
+ interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]
+)
+checkpoint_config = dict(interval=1)
+load_from = "ckpts/bevformer_r101_dcn_24ep.pth"
+
+find_unused_parameters = True
\ No newline at end of file
diff --git a/adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py b/adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py
new file mode 100644
index 0000000..2b0308d
--- /dev/null
+++ b/adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py
@@ -0,0 +1,665 @@
+_base_ = ["../_base_/datasets/nus-3d.py",
+ "../_base_/default_runtime.py"]
+
+# Update-2023-06-12:
+# [Enhance] Update some freezing args of UniAD
+# [Bugfix] Reproduce the from-scratch results of stage1
+# 1. Remove loss_past_traj in stage1 training
+# 2. Unfreeze neck and BN
+# --> Reproduced tracking result: AMOTA 0.393
+
+
+# With the neck and BN unfrozen, the from-scratch results of stage1 can be reproduced
+plugin = True
+# plugin_dir = "projects/mmdet3d_plugin/"
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+patch_size = [102.4, 102.4]
+img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# For Bench2Drive we do 9-class detection (see class_names below)
+
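+# NameMapping maps CARLA blueprint and static-mesh identifiers to the
+# Bench2Drive detection classes listed in class_names below.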
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+
+ #===================Construction===========
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
+
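+# eval_cfg mirrors the nuScenes-style detection evaluation (matching distance
+# thresholds, TP metrics, and per-class evaluation ranges) for Bench2Drive.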
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
+
+
+
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+
+input_modality = dict(
+ use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True
+)
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+_feed_dim_ = _ffn_dim_
+_dim_half_ = _pos_dim_
+canvas_size = (bev_h_, bev_w_)
+
+# NOTE: You can change queue_length from 5 to 3 to save GPU memory, but at risk of performance drop.
+queue_length = 5 # each sequence contains `queue_length` frames.
+
+### traj prediction args ###
+predict_steps = 12
+predict_modes = 6
+fut_steps = 4
+past_steps = 4
+use_nonlinear_optimizer = True
+
+## occflow setting
+occ_n_future = 4
+occ_n_future_plan = 6
+occ_n_future_max = max([occ_n_future, occ_n_future_plan])
+
+### planning ###
+planning_steps = 6
+use_col_optim = True
+
+### Occ args ###
+occflow_grid_conf = {
+ 'xbound': [-50.0, 50.0, 0.5],
+ 'ybound': [-50.0, 50.0, 0.5],
+ 'zbound': [-10.0, 10.0, 20.0],
+}
+
+# Other settings
+train_gt_iou_threshold=0.3
+
+model = dict(
+ type="UniAD",
+ gt_iou_threshold=train_gt_iou_threshold,
+ queue_length=queue_length,
+ use_grid_mask=True,
+ video_test_mode=True,
+ num_query=900,
+ num_classes=len(class_names),
+ pc_range=point_cloud_range,
+ img_backbone=dict(
+ type="ResNet",
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=4,
+ norm_cfg=dict(type="BN2d", requires_grad=False),
+ norm_eval=True,
+ style="caffe",
+ dcn=dict(
+ type="DCNv2", deform_groups=1, fallback_on_stride=False
+ ), # original DCNv2 will print log when perform load_state_dict
+ stage_with_dcn=(False, False, True, True),
+ ),
+ img_neck=dict(
+ type="FPN",
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs="on_output",
+ num_outs=4,
+ relu_before_extra_convs=True,
+ ),
+ freeze_img_backbone=True,
+ freeze_img_neck=False,
+ freeze_bn=False,
+ score_thresh=0.4,
+ filter_score_thresh=0.35,
+ qim_args=dict(
+ qim_type="QIMBase",
+ merger_dropout=0,
+ update_query_pos=True,
+ fp_ratio=0.3,
+ random_drop=0.1,
+ ), # hyper-param for query dropping mentioned in MOTR
+ mem_args=dict(
+ memory_bank_type="MemoryBank",
+ memory_bank_score_thresh=0.0,
+ memory_bank_len=4,
+ ),
+ loss_cfg=dict(
+ type="ClipMatcher",
+ num_classes=len(class_names),
+ weight_dict=None,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ assigner=dict(
+ type="HungarianAssigner3DTrack",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ pc_range=point_cloud_range,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_past_traj_weight=0.0,
+ ), # loss cfg for tracking
+ pts_bbox_head=dict(
+ type="BEVFormerTrackHead",
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=len(class_names),
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ transformer=dict(
+ type="UniADPerceptionTransformer",
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type="BEVFormerEncoder",
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type="BEVFormerLayer",
+ attn_cfgs=[
+ dict(
+ type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1
+ ),
+ dict(
+ type="SpatialCrossAttention",
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type="MSDeformableAttention3D",
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_,
+ ),
+ embed_dims=_dim_,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ decoder=dict(
+ type="DetectionTransformerDecoder",
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type="DetrTransformerDecoderLayer",
+ attn_cfgs=[
+ dict(
+ type="MultiheadAttention",
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0,
+ ),
+ dict(
+ type="CustomMSDeformableAttention",
+ embed_dims=_dim_,
+ num_levels=1,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ ),
+ bbox_coder=dict(
+ type="NMSFreeCoder",
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=len(class_names),
+ ),
+ positional_encoding=dict(
+ type="LearnedPositionalEncoding",
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_iou=dict(type="GIoULoss", loss_weight=0.0),
+ ),
+ seg_head=dict(
+ type='PansegformerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ canvas_size=canvas_size,
+ pc_range=point_cloud_range,
+ num_query=300,
+ num_classes=6,
+ num_things_classes=6,
+ num_stuff_classes=0,
+ in_channels=2048,
+ sync_cls_avg_factor=True,
+ as_two_stage=False,
+ with_box_refine=True,
+ transformer=dict(
+ type='SegDeformableTransformer',
+ encoder=dict(
+ type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ ),
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+ decoder=dict(
+ type='DeformableDetrTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0),
+ dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ )
+ ],
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')
+ ),
+ ),
+ ),
+ positional_encoding=dict(
+ type='SinePositionalEncoding',
+ num_feats=_dim_half_,
+ normalize=True,
+ offset=-0.5),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
+ thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4),
+ stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True),
+ train_cfg=dict(
+ assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ ),
+ assigner_with_mask=dict(
+ type='HungarianAssigner_multi_info',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ mask_cost=dict(type='DiceCost', weight=2.0),
+ ),
+            sampler=dict(type='PseudoSampler'),
+            sampler_with_mask=dict(type='PseudoSampler_segformer'),
+ ),
+ ),
+
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type="HungarianAssigner3D",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ iou_cost=dict(
+ type="IoUCost", weight=0.0
+            ), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range,
+ ),
+ )
+ ),
+)
+dataset_type = "B2D_E2E_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+map_root = "data/bench2drive/maps"
+map_file = "data/infos/b2d_map_infos.pkl"
+file_client_args = dict(backend="disk")
+ann_file_train = info_root + "/b2d_infos_train.pkl"
+ann_file_val = info_root + "/b2d_infos_val.pkl"
+ann_file_test = info_root + "/b2d_infos_val.pkl"
+
+
+train_pipeline = [
+ dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root),
+ dict(type="PhotoMetricDistortionMultiViewImage"),
+ dict(
+ type="LoadAnnotations3D_E2E",
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+ with_vis_token=False,
+ with_future_anns=False, # occ_flow gt
+ with_ins_inds_3d=True, # ins_inds
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+
+ # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ # filter_invisible=False), # NOTE: Currently vis_token is not in pkl
+
+ dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range),
+ dict(type="ObjectNameFilterTrack", classes=class_names),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type="DefaultFormatBundle3D", class_names=class_names),
+ dict(
+ type="CustomCollect3D",
+ keys=[
+ "gt_bboxes_3d",
+ "gt_labels_3d",
+ "gt_inds",
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_fut_traj",
+ "gt_fut_traj_mask",
+ "gt_past_traj",
+ "gt_past_traj_mask",
+ "gt_sdc_bbox",
+ "gt_sdc_label",
+ "gt_sdc_fut_traj",
+ "gt_sdc_fut_traj_mask",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # Occ gt
+ # "gt_segmentation",
+ # "gt_instance",
+ # "gt_centerness",
+ # "gt_offset",
+ # "gt_flow",
+ # "gt_backward_flow",
+ # "gt_occ_has_invalid_frame",
+ # "gt_occ_img_is_valid",
+ # # gt future bbox for plan
+ # "gt_future_boxes",
+ # "gt_future_labels",
+ # # planning
+ # "sdc_planning",
+ # "sdc_planning_mask",
+ # "command",
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type='LoadAnnotations3D_E2E',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_attr_label=False,
+ with_vis_token=False,
+ with_future_anns=False,
+ with_ins_inds_3d=False,
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+ # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ # filter_invisible=False),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # "gt_segmentation",
+ # "gt_instance",
+ # "gt_centerness",
+ # "gt_offset",
+ # "gt_flow",
+ # "gt_backward_flow",
+ # "gt_occ_has_invalid_frame",
+ # "gt_occ_img_is_valid",
+ # # planning
+ # "sdc_planning",
+ # "sdc_planning_mask",
+ # "command",
+ ]
+ ),
+ ],
+ ),
+]
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ patch_size=patch_size,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ point_cloud_range=point_cloud_range,
+ box_type_3d="LiDAR",
+ ),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ shuffler_sampler=dict(type="DistributedGroupSampler"),
+ nonshuffler_sampler=dict(type="DistributedSampler"),
+)
+
+optimizer = dict(
+ type="AdamW",
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ "img_backbone": dict(lr_mult=0.1),
+ }
+ ),
+ weight_decay=0.01,
+)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ by_epoch=False,
+ policy="CosineAnnealing",
+ warmup="linear",
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3,
+)
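+# For reference, with lr = 2e-4 this schedule warms up linearly from lr / 3 over the first
+# 500 iterations and then cosine-anneals toward min_lr_ratio * lr = 2e-7; img_backbone
+# parameters get a 0.1x multiplier (2e-5), although the backbone is frozen in this config.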
+total_epochs = 1
+evaluation = dict(interval=1, pipeline=test_pipeline)
+runner = dict(type="EpochBasedRunner", max_epochs=total_epochs)
+log_config = dict(
+ interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]
+)
+checkpoint_config = dict(interval=3000, by_epoch=False)
+
+find_unused_parameters = True
\ No newline at end of file
diff --git a/adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py b/adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py
new file mode 100644
index 0000000..c94ff40
--- /dev/null
+++ b/adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py
@@ -0,0 +1,656 @@
+_base_ = ["../_base_/datasets/nus-3d.py",
+ "../_base_/default_runtime.py"]
+
+# Update-2023-06-12:
+# [Enhance] Update some freezing args of UniAD
+# [Bugfix] Reproduce the from-scratch results of stage1
+# 1. Remove loss_past_traj in stage1 training
+# 2. Unfreeze neck and BN
+# --> Reproduced tracking result: AMOTA 0.393
+
+
+# With the neck and BN unfrozen, the from-scratch results of stage1 can be reproduced
+plugin = True
+# plugin_dir = "projects/mmdet3d_plugin/"
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+patch_size = [102.4, 102.4]
+img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# Bench2Drive uses the 9 detection classes listed in class_names below
+
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+
+ #===================Construction===========
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
+
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+
+input_modality = dict(
+ use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True
+)
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 100
+bev_w_ = 100
+_feed_dim_ = _ffn_dim_
+_dim_half_ = _pos_dim_
+canvas_size = (bev_h_*2, bev_w_*2)
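+# With bev_h_ = bev_w_ = 100 over the 102.4 m point_cloud_range above, each BEV cell covers
+# roughly 1.02 m; canvas_size is doubled to (200, 200), presumably because the seg head below
+# also runs at twice the BEV resolution (bev_h_*2, bev_w_*2).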
+
+# NOTE: queue_length is reduced to 3 here (from 5 in the base config) to save GPU memory, at the risk of a performance drop.
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+### traj prediction args ###
+predict_steps = 12
+predict_modes = 6
+fut_steps = 4
+past_steps = 4
+use_nonlinear_optimizer = True
+
+## occflow setting
+occ_n_future = 4
+occ_n_future_plan = 6
+occ_n_future_max = max([occ_n_future, occ_n_future_plan])
+
+### planning ###
+planning_steps = 6
+use_col_optim = True
+
+### Occ args ###
+occflow_grid_conf = {
+ 'xbound': [-50.0, 50.0, 0.5],
+ 'ybound': [-50.0, 50.0, 0.5],
+ 'zbound': [-10.0, 10.0, 20.0],
+}
+
+# Other settings
+train_gt_iou_threshold=0.3
+
+model = dict(
+ type="UniAD",
+ gt_iou_threshold=train_gt_iou_threshold,
+ queue_length=queue_length,
+ use_grid_mask=True,
+ video_test_mode=True,
+ num_query=900,
+ num_classes=len(class_names),
+ pc_range=point_cloud_range,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1,2,3),
+ frozen_stages=4,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type="FPN",
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs="on_output",
+ num_outs=4,
+ relu_before_extra_convs=True,
+ ),
+ freeze_img_backbone=True,
+ freeze_img_neck=False,
+ freeze_bn=False,
+ score_thresh=0.4,
+ filter_score_thresh=0.35,
+ qim_args=dict(
+ qim_type="QIMBase",
+ merger_dropout=0,
+ update_query_pos=True,
+ fp_ratio=0.3,
+ random_drop=0.1,
+ ), # hyper-param for query dropping mentioned in MOTR
+ mem_args=dict(
+ memory_bank_type="MemoryBank",
+ memory_bank_score_thresh=0.0,
+ memory_bank_len=4,
+ ),
+ loss_cfg=dict(
+ type="ClipMatcher",
+ num_classes=len(class_names),
+ weight_dict=None,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ assigner=dict(
+ type="HungarianAssigner3DTrack",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ pc_range=point_cloud_range,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_past_traj_weight=0.0,
+ ), # loss cfg for tracking
+ pts_bbox_head=dict(
+ type="BEVFormerTrackHead",
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=len(class_names),
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ transformer=dict(
+ type="UniADPerceptionTransformer",
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type="BEVFormerEncoder",
+ num_layers=3,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type="BEVFormerLayer",
+ attn_cfgs=[
+ dict(
+ type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1
+ ),
+ dict(
+ type="SpatialCrossAttention",
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type="MSDeformableAttention3D",
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_,
+ ),
+ embed_dims=_dim_,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ decoder=dict(
+ type="DetectionTransformerDecoder",
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type="DetrTransformerDecoderLayer",
+ attn_cfgs=[
+ dict(
+ type="MultiheadAttention",
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0,
+ ),
+ dict(
+ type="CustomMSDeformableAttention",
+ embed_dims=_dim_,
+ num_levels=1,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ ),
+ bbox_coder=dict(
+ type="NMSFreeCoder",
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=len(class_names),
+ ),
+ positional_encoding=dict(
+ type="LearnedPositionalEncoding",
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_iou=dict(type="GIoULoss", loss_weight=0.0),
+ ),
+ seg_head=dict(
+ type='PansegformerHead',
+ bev_h=bev_h_*2,
+ bev_w=bev_w_*2,
+ canvas_size=canvas_size,
+ pc_range=point_cloud_range,
+ num_query=300,
+ num_classes=6,
+ num_things_classes=6,
+ num_stuff_classes=0,
+ in_channels=2048,
+ sync_cls_avg_factor=True,
+ as_two_stage=False,
+ with_box_refine=True,
+ transformer=dict(
+ type='SegDeformableTransformer',
+ encoder=dict(
+ type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ ),
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+ decoder=dict(
+ type='DeformableDetrTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0),
+ dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ )
+ ],
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')
+ ),
+ ),
+ ),
+ positional_encoding=dict(
+ type='SinePositionalEncoding',
+ num_feats=_dim_half_,
+ normalize=True,
+ offset=-0.5),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
+ thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4),
+ stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True),
+ train_cfg=dict(
+ assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ ),
+ assigner_with_mask=dict(
+ type='HungarianAssigner_multi_info',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ mask_cost=dict(type='DiceCost', weight=2.0),
+ ),
+            sampler=dict(type='PseudoSampler'),
+            sampler_with_mask=dict(type='PseudoSampler_segformer'),
+ ),
+ ),
+
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type="HungarianAssigner3D",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ iou_cost=dict(
+ type="IoUCost", weight=0.0
+            ), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range,
+ ),
+ )
+ ),
+)
+dataset_type = "B2D_E2E_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+map_root = "data/bench2drive/maps"
+map_file = "data/infos/b2d_map_infos.pkl"
+file_client_args = dict(backend="disk")
+ann_file_train = info_root + "/b2d_infos_train.pkl"
+ann_file_val = info_root + "/b2d_infos_val.pkl"
+ann_file_test = info_root + "/b2d_infos_val.pkl"
+
+
+train_pipeline = [
+ dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root),
+ dict(type="PhotoMetricDistortionMultiViewImage"),
+ dict(
+ type="LoadAnnotations3D_E2E",
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+
+ with_future_anns=False, # occ_flow gt
+ with_ins_inds_3d=True, # ins_inds
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+
+ # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ # filter_invisible=False), # NOTE: Currently vis_token is not in pkl
+
+ dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range),
+ dict(type="ObjectNameFilterTrack", classes=class_names),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type="DefaultFormatBundle3D", class_names=class_names),
+ dict(
+ type="CustomCollect3D",
+ keys=[
+ "gt_bboxes_3d",
+ "gt_labels_3d",
+ "gt_inds",
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_fut_traj",
+ "gt_fut_traj_mask",
+ "gt_past_traj",
+ "gt_past_traj_mask",
+ "gt_sdc_bbox",
+ "gt_sdc_label",
+ "gt_sdc_fut_traj",
+ "gt_sdc_fut_traj_mask",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # Occ gt
+ # "gt_segmentation",
+ # "gt_instance",
+ # "gt_centerness",
+ # "gt_offset",
+ # "gt_flow",
+ # "gt_backward_flow",
+ # "gt_occ_has_invalid_frame",
+ # "gt_occ_img_is_valid",
+ # # gt future bbox for plan
+ # "gt_future_boxes",
+ # "gt_future_labels",
+ # # planning
+ # "sdc_planning",
+ # "sdc_planning_mask",
+ # "command",
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type='LoadAnnotations3D_E2E',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_attr_label=False,
+
+ with_future_anns=False,
+ with_ins_inds_3d=False,
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+ # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ # filter_invisible=False),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ # "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # "gt_segmentation",
+ # "gt_instance",
+ # "gt_centerness",
+ # "gt_offset",
+ # "gt_flow",
+ # "gt_backward_flow",
+ # "gt_occ_has_invalid_frame",
+ # "gt_occ_img_is_valid",
+ # # planning
+ # "sdc_planning",
+ # "sdc_planning_mask",
+ # "command",
+ ]
+ ),
+ ],
+ ),
+]
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ patch_size=patch_size,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ point_cloud_range=point_cloud_range,
+ box_type_3d="LiDAR",
+ ),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ shuffler_sampler=dict(type="DistributedGroupSampler"),
+ nonshuffler_sampler=dict(type="DistributedSampler"),
+)
+optimizer = dict(
+ type="AdamW",
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ "img_backbone": dict(lr_mult=0.1),
+ }
+ ),
+ weight_decay=0.01,
+)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ by_epoch=False,
+ policy="CosineAnnealing",
+ warmup="linear",
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3,
+)
+total_epochs = 1
+evaluation = dict(interval=1, pipeline=test_pipeline)
+runner = dict(type="EpochBasedRunner", max_epochs=total_epochs)
+log_config = dict(
+ interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]
+)
+checkpoint_config = dict(interval=3000, by_epoch=False)
+
+find_unused_parameters = True
\ No newline at end of file
diff --git a/adzoo/uniad/configs/stage2_e2e/base_e2e.py b/adzoo/uniad/configs/stage2_e2e/base_e2e.py
new file mode 100644
index 0000000..86a09fd
--- /dev/null
+++ b/adzoo/uniad/configs/stage2_e2e/base_e2e.py
@@ -0,0 +1,696 @@
+_base_ = ["../_base_/datasets/nus-3d.py",
+ "../_base_/default_runtime.py"]
+
+# Update-2023-06-12:
+# [Enhance] Update some freezing args of UniAD
+plugin = True
+# plugin_dir = "projects/mmdet3d_plugin/"
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+patch_size = [102.4, 102.4]
+img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+ "car",
+ "truck",
+ "construction_vehicle",
+ "bus",
+ "trailer",
+ "barrier",
+ "motorcycle",
+ "bicycle",
+ "pedestrian",
+ "traffic_cone",
+]
+vehicle_id_list = [0, 1, 2, 3, 4, 6, 7]
+group_id_list = [[0,1,2,3,4], [6,7], [8], [5,9]]
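+# These indices refer to class_names above: vehicle_id_list covers the vehicle-like classes
+# (car, truck, construction_vehicle, bus, trailer, motorcycle, bicycle), and group_id_list
+# groups them as {car..trailer}, {motorcycle, bicycle}, {pedestrian}, {barrier, traffic_cone}.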
+input_modality = dict(
+ use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True
+)
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+_feed_dim_ = _ffn_dim_
+_dim_half_ = _pos_dim_
+canvas_size = (bev_h_, bev_w_)
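+# A 200 x 200 BEV over the 102.4 m point_cloud_range above gives roughly 0.51 m per cell;
+# canvas_size matches the BEV grid here.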
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+### traj prediction args ###
+predict_steps = 12
+predict_modes = 6
+fut_steps = 4
+past_steps = 4
+use_nonlinear_optimizer = True
+
+## occflow setting
+occ_n_future = 4
+occ_n_future_plan = 6
+occ_n_future_max = max([occ_n_future, occ_n_future_plan])
+
+### planning ###
+planning_steps = 6
+use_col_optim = True
+
+### Occ args ###
+occflow_grid_conf = {
+ 'xbound': [-50.0, 50.0, 0.5],
+ 'ybound': [-50.0, 50.0, 0.5],
+ 'zbound': [-10.0, 10.0, 20.0],
+}
+
+# Other settings
+train_gt_iou_threshold=0.3
+
+model = dict(
+ type="UniAD",
+ gt_iou_threshold=train_gt_iou_threshold,
+ queue_length=queue_length,
+ use_grid_mask=True,
+ video_test_mode=True,
+ num_query=900,
+ num_classes=10,
+ vehicle_id_list=vehicle_id_list,
+ pc_range=point_cloud_range,
+ img_backbone=dict(
+ type="ResNet",
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=4,
+ norm_cfg=dict(type="BN2d", requires_grad=False),
+ norm_eval=True,
+ style="caffe",
+ dcn=dict(
+ type="DCNv2", deform_groups=1, fallback_on_stride=False
+        ), # the original DCNv2 prints a log when performing load_state_dict
+ stage_with_dcn=(False, False, True, True),
+ ),
+ img_neck=dict(
+ type="FPN",
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs="on_output",
+ num_outs=4,
+ relu_before_extra_convs=True,
+ ),
+ freeze_img_backbone=True,
+ freeze_img_neck=True,
+ freeze_bn=True,
+ freeze_bev_encoder=True,
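+    # In this stage-2 config the stage-1 perception parts (backbone, neck, BN and the BEV
+    # encoder) stay frozen; their weights come from the stage-1 checkpoint loaded via
+    # load_from at the bottom of this file.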
+ score_thresh=0.4,
+ filter_score_thresh=0.35,
+ qim_args=dict(
+ qim_type="QIMBase",
+ merger_dropout=0,
+ update_query_pos=True,
+ fp_ratio=0.3,
+ random_drop=0.1,
+ ), # hyper-param for query dropping mentioned in MOTR
+ mem_args=dict(
+ memory_bank_type="MemoryBank",
+ memory_bank_score_thresh=0.0,
+ memory_bank_len=4,
+ ),
+ loss_cfg=dict(
+ type="ClipMatcher",
+ num_classes=10,
+ weight_dict=None,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ assigner=dict(
+ type="HungarianAssigner3DTrack",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ pc_range=point_cloud_range,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ ), # loss cfg for tracking
+ pts_bbox_head=dict(
+ type="BEVFormerTrackHead",
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=10,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ transformer=dict(
+ type="UniADPerceptionTransformer",
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type="BEVFormerEncoder",
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type="BEVFormerLayer",
+ attn_cfgs=[
+ dict(
+ type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1
+ ),
+ dict(
+ type="SpatialCrossAttention",
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type="MSDeformableAttention3D",
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_,
+ ),
+ embed_dims=_dim_,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ decoder=dict(
+ type="DetectionTransformerDecoder",
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type="DetrTransformerDecoderLayer",
+ attn_cfgs=[
+ dict(
+ type="MultiheadAttention",
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1,
+ ),
+ dict(
+ type="CustomMSDeformableAttention",
+ embed_dims=_dim_,
+ num_levels=1,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ ),
+ bbox_coder=dict(
+ type="NMSFreeCoder",
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=10,
+ ),
+ positional_encoding=dict(
+ type="LearnedPositionalEncoding",
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_iou=dict(type="GIoULoss", loss_weight=0.0),
+ ),
+ seg_head=dict(
+ type='PansegformerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ canvas_size=canvas_size,
+ pc_range=point_cloud_range,
+ num_query=300,
+ num_classes=4,
+ num_things_classes=3,
+ num_stuff_classes=1,
+ in_channels=2048,
+ sync_cls_avg_factor=True,
+ as_two_stage=False,
+ with_box_refine=True,
+ transformer=dict(
+ type='SegDeformableTransformer',
+ encoder=dict(
+ type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ ),
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+ decoder=dict(
+ type='DeformableDetrTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ )
+ ],
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')
+ ),
+ ),
+ ),
+ positional_encoding=dict(
+ type='SinePositionalEncoding',
+ num_feats=_dim_half_,
+ normalize=True,
+ offset=-0.5),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
+ thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4),
+ stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True),
+ train_cfg=dict(
+ assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ ),
+ assigner_with_mask=dict(
+ type='HungarianAssigner_multi_info',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ mask_cost=dict(type='DiceCost', weight=2.0),
+ ),
+            sampler=dict(type='PseudoSampler'),
+            sampler_with_mask=dict(type='PseudoSampler_segformer'),
+ ),
+ ),
+ occ_head=dict(
+ type='OccHead',
+
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+
+ bev_proj_dim=256,
+ bev_proj_nlayers=4,
+
+ # Transformer
+ attn_mask_thresh=0.3,
+ transformer_decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=5,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(
+ type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ attn_drop=0.0,
+ proj_drop=0.0,
+ dropout_layer=None,
+ batch_first=False),
+ ffn_cfgs=dict(
+ embed_dims=256,
+ feedforward_channels=2048, # change to 512
+ num_fcs=2,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ffn_drop=0.0,
+ dropout_layer=None,
+ add_identity=True),
+ feedforward_channels=2048,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')),
+ init_cfg=None),
+ # Query
+ query_dim=256,
+ query_mlp_layers=3,
+
+ aux_loss_weight=1.,
+ loss_mask=dict(
+ type='FieryBinarySegmentationLoss',
+ use_top_k=True,
+ top_k_ratio=0.25,
+ future_discount=0.95,
+ loss_weight=5.0,
+ ignore_index=255,
+ ),
+ loss_dice=dict(
+ type='DiceLossWithMasks',
+ use_sigmoid=True,
+ activate=True,
+ reduction='mean',
+ naive_dice=True,
+ eps=1.0,
+ ignore_index=255,
+ loss_weight=1.0),
+
+
+ pan_eval=True,
+ test_seg_thresh=0.1,
+ test_with_track_score=True,
+ ),
+ motion_head=dict(
+ type='MotionHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=300,
+ num_classes=10,
+ predict_steps=predict_steps,
+ predict_modes=predict_modes,
+ embed_dims=_dim_,
+ loss_traj=dict(type='TrajLoss',
+ use_variance=True,
+ cls_loss_weight=0.5,
+ nll_loss_weight=0.5,
+ loss_weight_minade=0.,
+ loss_weight_minfde=0.25),
+ num_cls_fcs=3,
+ pc_range=point_cloud_range,
+ group_id_list=group_id_list,
+ num_anchor=6,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ anchor_info_path='data/others/motion_anchor_infos_mode6.pkl',
+ transformerlayers=dict(
+ type='MotionTransformerDecoder',
+ pc_range=point_cloud_range,
+ embed_dims=_dim_,
+ num_layers=3,
+ transformerlayers=dict(
+ type='MotionTransformerAttentionLayer',
+ batch_first=True,
+ attn_cfgs=[
+ dict(
+ type='MotionDeformableAttention',
+ num_steps=predict_steps,
+ embed_dims=_dim_,
+ num_levels=1,
+ num_heads=8,
+ num_points=4,
+ sample_index=-1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm')),
+ ),
+ ),
+ planning_head=dict(
+ type='PlanningHeadSingleMode',
+ embed_dims=256,
+ planning_steps=planning_steps,
+ loss_planning=dict(type='PlanningLoss'),
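+        # The three collision terms below use increasing delta (presumably the safety margin,
+        # in meters) with decreasing weight, so the tightest margin is penalized most heavily.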
+ loss_collision=[dict(type='CollisionLoss', delta=0.0, weight=2.5),
+ dict(type='CollisionLoss', delta=0.5, weight=1.0),
+ dict(type='CollisionLoss', delta=1.0, weight=0.25)],
+ use_col_optim=use_col_optim,
+ planning_eval=True,
+ with_adapter=True,
+ ),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type="HungarianAssigner3D",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ iou_cost=dict(
+ type="IoUCost", weight=0.0
+            ), # Fake cost. This is just to make it compatible with the DETR head.
+ pc_range=point_cloud_range,
+ ),
+ )
+ ),
+)
+dataset_type = "NuScenesE2EDataset"
+data_root = "data/nuscenes/"
+info_root = "data/infos/"
+file_client_args = dict(backend="disk")
+ann_file_train = info_root + "nuscenes_infos_temporal_train.pkl"
+ann_file_val = info_root + "nuscenes_infos_temporal_val.pkl"
+ann_file_test = info_root + "nuscenes_infos_temporal_val.pkl"
+
+
+train_pipeline = [
+ dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root),
+ dict(type="PhotoMetricDistortionMultiViewImage"),
+ dict(
+ type="LoadAnnotations3D_E2E",
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+
+ with_future_anns=True, # occ_flow gt
+ with_ins_inds_3d=True, # ins_inds
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+
+ dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ filter_invisible=False), # NOTE: Currently vis_token is not in pkl
+
+ dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range),
+ dict(type="ObjectNameFilterTrack", classes=class_names),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type="DefaultFormatBundle3D", class_names=class_names),
+ dict(
+ type="CustomCollect3D",
+ keys=[
+ "gt_bboxes_3d",
+ "gt_labels_3d",
+ "gt_inds",
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_fut_traj",
+ "gt_fut_traj_mask",
+ "gt_past_traj",
+ "gt_past_traj_mask",
+ "gt_sdc_bbox",
+ "gt_sdc_label",
+ "gt_sdc_fut_traj",
+ "gt_sdc_fut_traj_mask",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # Occ gt
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # gt future bbox for plan
+ "gt_future_boxes",
+ "gt_future_labels",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type='LoadAnnotations3D_E2E',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_attr_label=False,
+
+ with_future_anns=True,
+ with_ins_inds_3d=False,
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+ dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True,
+ filter_invisible=False),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ]
+ ),
+ ],
+ ),
+]
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=8,
+ train=dict(
+ type=dataset_type,
+ file_client_args=file_client_args,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ patch_size=patch_size,
+ canvas_size=canvas_size,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ predict_steps=predict_steps,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+
+ occ_receptive_field=3,
+ occ_n_future=occ_n_future_max,
+ occ_filter_invalid_sample=False,
+
+        # we use box_type_3d='LiDAR' for the KITTI and nuScenes datasets,
+        # and box_type_3d='Depth' for the SUN RGB-D and ScanNet datasets.
+ box_type_3d="LiDAR",
+ ),
+ val=dict(
+ type=dataset_type,
+ file_client_args=file_client_args,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ patch_size=patch_size,
+ canvas_size=canvas_size,
+ bev_size=(bev_h_, bev_w_),
+ predict_steps=predict_steps,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ eval_mod=['det', 'map', 'track','motion'],
+
+
+ occ_receptive_field=3,
+ occ_n_future=occ_n_future_max,
+ occ_filter_invalid_sample=False,
+ ),
+ test=dict(
+ type=dataset_type,
+ file_client_args=file_client_args,
+ data_root=data_root,
+ test_mode=True,
+ ann_file=ann_file_test,
+ pipeline=test_pipeline,
+ patch_size=patch_size,
+ canvas_size=canvas_size,
+ bev_size=(bev_h_, bev_w_),
+ predict_steps=predict_steps,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ occ_n_future=occ_n_future_max,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ classes=class_names,
+ modality=input_modality,
+ eval_mod=['det', 'map', 'track','motion'],
+ ),
+ shuffler_sampler=dict(type="DistributedGroupSampler"),
+ nonshuffler_sampler=dict(type="DistributedSampler"),
+)
+optimizer = dict(
+ type="AdamW",
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ "img_backbone": dict(lr_mult=0.1),
+ }
+ ),
+ weight_decay=0.01,
+)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy="CosineAnnealing",
+ warmup="linear",
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3,
+)
+total_epochs = 2
+evaluation = dict(interval=1, pipeline=test_pipeline)
+runner = dict(type="EpochBasedRunner", max_epochs=total_epochs)
+log_config = dict(
+ interval=10, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]
+)
+checkpoint_config = dict(interval=1)
+load_from = "ckpts/uniad_base_track_map.pth"
+
+find_unused_parameters = True
\ No newline at end of file
diff --git a/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py b/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py
new file mode 100644
index 0000000..a0e156c
--- /dev/null
+++ b/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py
@@ -0,0 +1,819 @@
+_base_ = ["../_base_/datasets/nus-3d.py",
+ "../_base_/default_runtime.py"]
+
+# Update-2023-06-12:
+# [Enhance] Update some freezing args of UniAD
+# plugin_dir = "projects/mmdet3d_plugin/"
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+patch_size = [102.4, 102.4]
+img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+
+ #===================Construction===========
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
+
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+
+vehicle_id_list = [0,1,2]
+group_id_list = [[0, 1, 2], [3], [7]]
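+# Indices into class_names above: vehicle_id_list = car / van / truck, and group_id_list
+# groups {car, van, truck}, {bicycle}, {pedestrian}; traffic signs, cones and lights are
+# not grouped for motion prediction here.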
+
+input_modality = dict(
+ use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True
+)
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+_feed_dim_ = _ffn_dim_
+_dim_half_ = _pos_dim_
+canvas_size = (bev_h_, bev_w_)
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+### traj prediction args ###
+predict_steps = 12
+predict_modes = 6
+fut_steps = 4
+past_steps = 4
+use_nonlinear_optimizer = True
+
+## occflow setting
+occ_n_future = 4
+occ_n_future_plan = 6
+occ_n_future_max = max([occ_n_future, occ_n_future_plan])
+
+### planning ###
+planning_steps = 6
+use_col_optim = True
+
+### Occ args ###
+occflow_grid_conf = {
+ 'xbound': [-50.0, 50.0, 0.5],
+ 'ybound': [-50.0, 50.0, 0.5],
+ 'zbound': [-10.0, 10.0, 20.0],
+}
+
+# Other settings
+train_gt_iou_threshold=0.3
+
+model = dict(
+ type="UniAD",
+ gt_iou_threshold=train_gt_iou_threshold,
+ queue_length=queue_length,
+ use_grid_mask=True,
+ video_test_mode=True,
+ num_query=900,
+ num_classes=len(class_names),
+ vehicle_id_list=vehicle_id_list,
+ pc_range=point_cloud_range,
+ img_backbone=dict(
+ type="ResNet",
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=4,
+ norm_cfg=dict(type="BN2d", requires_grad=False),
+ norm_eval=True,
+ style="caffe",
+ dcn=dict(
+ type="DCNv2", deform_groups=1, fallback_on_stride=False
+        ), # the original DCNv2 prints a log when performing load_state_dict
+ stage_with_dcn=(False, False, True, True),
+ ),
+ img_neck=dict(
+ type="FPN",
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs="on_output",
+ num_outs=4,
+ relu_before_extra_convs=True,
+ ),
+ freeze_img_backbone=True,
+ freeze_img_neck=True,
+ freeze_bn=True,
+ freeze_bev_encoder=True,
+ score_thresh=0.4,
+ filter_score_thresh=0.35,
+ qim_args=dict(
+ qim_type="QIMBase",
+ merger_dropout=0,
+ update_query_pos=True,
+ fp_ratio=0.3,
+ random_drop=0.1,
+ ), # hyper-param for query dropping mentioned in MOTR
+ mem_args=dict(
+ memory_bank_type="MemoryBank",
+ memory_bank_score_thresh=0.0,
+ memory_bank_len=4,
+ ),
+ loss_cfg=dict(
+ type="ClipMatcher",
+ num_classes=len(class_names),
+ weight_dict=None,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ assigner=dict(
+ type="HungarianAssigner3DTrack",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ pc_range=point_cloud_range,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ ), # loss cfg for tracking
+ pts_bbox_head=dict(
+ type="BEVFormerTrackHead",
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=len(class_names),
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ transformer=dict(
+ type="UniADPerceptionTransformer",
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type="BEVFormerEncoder",
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type="BEVFormerLayer",
+ attn_cfgs=[
+ dict(
+ type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1
+ ),
+ dict(
+ type="SpatialCrossAttention",
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type="MSDeformableAttention3D",
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_,
+ ),
+ embed_dims=_dim_,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ decoder=dict(
+ type="DetectionTransformerDecoder",
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type="DetrTransformerDecoderLayer",
+ attn_cfgs=[
+ dict(
+ type="MultiheadAttention",
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0,
+ ),
+ dict(
+ type="CustomMSDeformableAttention",
+ embed_dims=_dim_,
+ num_levels=1,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ ),
+ bbox_coder=dict(
+ type="NMSFreeCoder",
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=len(class_names),
+ ),
+ positional_encoding=dict(
+ type="LearnedPositionalEncoding",
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_iou=dict(type="GIoULoss", loss_weight=0.0),
+ ),
+ seg_head=dict(
+ type='PansegformerHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ canvas_size=canvas_size,
+ pc_range=point_cloud_range,
+ num_query=300,
+ num_classes=6,
+ num_things_classes=6,
+ num_stuff_classes=0,
+ in_channels=2048,
+ sync_cls_avg_factor=True,
+ as_two_stage=False,
+ with_box_refine=True,
+ transformer=dict(
+ type='SegDeformableTransformer',
+ encoder=dict(
+ type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ ),
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+ decoder=dict(
+ type='DeformableDetrTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0),
+ dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ )
+ ],
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')
+ ),
+ ),
+ ),
+ positional_encoding=dict(
+ type='SinePositionalEncoding',
+ num_feats=_dim_half_,
+ normalize=True,
+ offset=-0.5),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
+        thing_transformer_head=dict(type='SegMaskHead', d_model=_dim_, nhead=8, num_decoder_layers=4),
+        stuff_transformer_head=dict(type='SegMaskHead', d_model=_dim_, nhead=8, num_decoder_layers=6, self_attn=True),
+ train_cfg=dict(
+ assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ ),
+ assigner_with_mask=dict(
+ type='HungarianAssigner_multi_info',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ mask_cost=dict(type='DiceCost', weight=2.0),
+ ),
+            sampler=dict(type='PseudoSampler'),
+            sampler_with_mask=dict(type='PseudoSampler_segformer'),
+ ),
+ ),
+ occ_head=dict(
+ type='OccHead',
+
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+
+ bev_proj_dim=256,
+ bev_proj_nlayers=4,
+
+ # Transformer
+ attn_mask_thresh=0.3,
+ transformer_decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=5,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(
+ type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ attn_drop=0.0,
+ proj_drop=0.0,
+ dropout_layer=None,
+ batch_first=False),
+ ffn_cfgs=dict(
+ embed_dims=256,
+ feedforward_channels=2048, # change to 512
+ num_fcs=2,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ffn_drop=0.0,
+ dropout_layer=None,
+ add_identity=True),
+ feedforward_channels=2048,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')),
+ init_cfg=None),
+ # Query
+ query_dim=256,
+ query_mlp_layers=3,
+
+ aux_loss_weight=1.,
+ loss_mask=dict(
+ type='FieryBinarySegmentationLoss',
+ use_top_k=True,
+ top_k_ratio=0.25,
+ future_discount=0.95,
+ loss_weight=5.0,
+ ignore_index=255,
+ ),
+ loss_dice=dict(
+ type='DiceLossWithMasks',
+ use_sigmoid=True,
+ activate=True,
+ reduction='mean',
+ naive_dice=True,
+ eps=1.0,
+ ignore_index=255,
+ loss_weight=1.0),
+
+
+ pan_eval=True,
+ test_seg_thresh=0.1,
+ test_with_track_score=True,
+ ),
+ motion_head=dict(
+ type='MotionHead',
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=300,
+ num_classes=len(class_names),
+ vehicle_id_list=vehicle_id_list,
+ predict_steps=predict_steps,
+ predict_modes=predict_modes,
+ embed_dims=_dim_,
+ loss_traj=dict(type='TrajLoss',
+ use_variance=True,
+ cls_loss_weight=0.5,
+ nll_loss_weight=0.5,
+ loss_weight_minade=0.,
+ loss_weight_minfde=0.25),
+ num_cls_fcs=3,
+ pc_range=point_cloud_range,
+ group_id_list=group_id_list,
+ num_anchor=6,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ anchor_info_path='data/others/b2d_motion_anchor_infos_mode6.pkl',
+ transformerlayers=dict(
+ type='MotionTransformerDecoder',
+ pc_range=point_cloud_range,
+ embed_dims=_dim_,
+ num_layers=3,
+ transformerlayers=dict(
+ type='MotionTransformerAttentionLayer',
+ batch_first=True,
+ attn_cfgs=[
+ dict(
+ type='MotionDeformableAttention',
+ num_steps=predict_steps,
+ embed_dims=_dim_,
+ num_levels=1,
+ num_heads=8,
+ num_points=4,
+ sample_index=-1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm')),
+ ),
+ ),
+ planning_head=dict(
+ type='PlanningHeadSingleMode',
+ embed_dims=256,
+ command_dim=6,
+ planning_steps=planning_steps,
+ loss_planning=dict(type='PlanningLoss'),
+ loss_collision=[dict(type='CollisionLoss', delta=0.0, weight=2.5),
+ dict(type='CollisionLoss', delta=0.5, weight=1.0),
+ dict(type='CollisionLoss', delta=1.0, weight=0.25)],
+ use_col_optim=use_col_optim,
+ planning_eval=True,
+ with_adapter=True,
+ ),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type="HungarianAssigner3D",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ iou_cost=dict(
+ type="IoUCost", weight=0.0
+ ), # Fake cost. This is just to make it compatible with DETR head.
+ pc_range=point_cloud_range,
+ ),
+ )
+ ),
+)
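+# Dataset and dataloader settings: Bench2Drive end-to-end dataset with tracking, map, occupancy and planning annotations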
+dataset_type = "B2D_E2E_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+map_root = "data/bench2drive/maps"
+map_file = "data/infos/b2d_map_infos.pkl"
+file_client_args = dict(backend="disk")
+ann_file_train=info_root + f"/b2d_infos_train.pkl"
+ann_file_val=info_root + f"/b2d_infos_val.pkl"
+ann_file_test=info_root + f"/b2d_infos_val.pkl"
+
+
+train_pipeline = [
+ dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root),
+ dict(type="PhotoMetricDistortionMultiViewImage"),
+ dict(
+ type="LoadAnnotations3D_E2E",
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+ with_vis_token=False,
+ with_future_anns=True, # occ_flow gt
+ with_ins_inds_3d=True, # ins_inds
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+
+ dict(type='GenerateOccFlowLabels',
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+ only_vehicle=True,
+ filter_invisible=False,
+ all_classes = class_names,
+ vehicle_classes = ['car','van','truck','bicycle'],
+ plan_classes = ['car','van','truck','bicycle','pedestrian'],
+ ),
+
+ dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range),
+ dict(type="ObjectNameFilterTrack", classes=class_names),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type="DefaultFormatBundle3D", class_names=class_names),
+ dict(
+ type="CustomCollect3D",
+ keys=[
+ "gt_bboxes_3d",
+ "gt_labels_3d",
+ "gt_inds",
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_fut_traj",
+ "gt_fut_traj_mask",
+ "gt_past_traj",
+ "gt_past_traj_mask",
+ "gt_sdc_bbox",
+ "gt_sdc_label",
+ "gt_sdc_fut_traj",
+ "gt_sdc_fut_traj_mask",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # Occ gt
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # gt future bbox for plan
+ "gt_future_boxes",
+ "gt_future_labels",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type='LoadAnnotations3D_E2E',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_attr_label=False,
+ with_vis_token=False,
+ with_future_anns=True,
+ with_ins_inds_3d=False,
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+ dict(type='GenerateOccFlowLabels',
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+ only_vehicle=True,
+ filter_invisible=False,
+ all_classes = class_names,
+ vehicle_classes = ['car','van','truck','bicycle'],
+ plan_classes = ['car','van','truck','bicycle','pedestrian'],
+ ),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ]
+ ),
+ ],
+ ),
+]
+
+inference_only_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "command",
+ ]
+ ),
+ ],
+ ),
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ patch_size=patch_size,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ point_cloud_range=point_cloud_range,
+ box_type_3d="LiDAR",
+ ),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ shuffler_sampler=dict(type="DistributedGroupSampler"),
+ nonshuffler_sampler=dict(type="DistributedSampler"),
+)
+optimizer = dict(
+ type="AdamW",
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ "img_backbone": dict(lr_mult=0.1),
+ }
+ ),
+ weight_decay=0.01,
+)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ by_epoch=False,
+ policy="CosineAnnealing",
+ warmup="linear",
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3,
+)
+total_epochs = 2
+evaluation = dict(interval=2, pipeline=test_pipeline)
+runner = dict(type="EpochBasedRunner", max_epochs=total_epochs)
+log_config = dict(
+ interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]
+)
+checkpoint_config = dict(interval=3000, by_epoch=False)
+
+
+find_unused_parameters = True
\ No newline at end of file
diff --git a/adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py b/adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py
new file mode 100644
index 0000000..b6c7a2c
--- /dev/null
+++ b/adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py
@@ -0,0 +1,813 @@
+_base_ = ["../_base_/datasets/nus-3d.py",
+ "../_base_/default_runtime.py"]
+
+# Update-2023-06-12:
+# [Enhance] Update some freezing args of UniAD
+# plugin_dir = "projects/mmdet3d_plugin/"
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+patch_size = [102.4, 102.4]
+img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+
+ #===================Construction===========
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
+
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+
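+# indices into class_names: car/van/truck (0-2) are treated as vehicles; motion groups below are vehicles, bicycle, pedestrian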
+vehicle_id_list = [0,1,2]
+group_id_list = [[0, 1, 2], [3], [7]]
+
+input_modality = dict(
+ use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True
+)
+_dim_ = 256
+_pos_dim_ = _dim_ // 2
+_ffn_dim_ = _dim_ * 2
+_num_levels_ = 4
+bev_h_ = 100
+bev_w_ = 100
+_feed_dim_ = _ffn_dim_
+_dim_half_ = _pos_dim_
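+# canvas for the BEV map segmentation head: twice the BEV query resolution (200 x 200 for this tiny config)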
+canvas_size = (bev_h_*2, bev_w_*2)
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+### traj prediction args ###
+predict_steps = 12
+predict_modes = 6
+fut_steps = 4
+past_steps = 4
+use_nonlinear_optimizer = True
+
+## occflow setting
+occ_n_future = 4
+occ_n_future_plan = 6
+occ_n_future_max = max([occ_n_future, occ_n_future_plan])
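+# the occupancy GT horizon must cover both the occ-flow horizon (occ_n_future) and the planning horizon (occ_n_future_plan)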
+
+### planning ###
+planning_steps = 6
+use_col_optim = True
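+# use_col_optim: post-hoc collision optimization of the planned trajectory at test time (passed to planning_head)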
+
+### Occ args ###
+occflow_grid_conf = {
+ 'xbound': [-50.0, 50.0, 0.5],
+ 'ybound': [-50.0, 50.0, 0.5],
+ 'zbound': [-10.0, 10.0, 20.0],
+}
+
+# Other settings
+train_gt_iou_threshold=0.3
+
+model = dict(
+ type="UniAD",
+ gt_iou_threshold=train_gt_iou_threshold,
+ queue_length=queue_length,
+ use_grid_mask=True,
+ video_test_mode=True,
+ prev_frame_num=10,
+ num_query=900,
+ num_classes=len(class_names),
+ vehicle_id_list=vehicle_id_list,
+ pc_range=point_cloud_range,
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1,2,3),
+ frozen_stages=4,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type="FPN",
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs="on_output",
+ num_outs=4,
+ relu_before_extra_convs=True,
+ ),
+ freeze_img_backbone=True,
+ freeze_img_neck=True,
+ freeze_bn=True,
+ freeze_bev_encoder=True,
+ score_thresh=0.4,
+ filter_score_thresh=0.35,
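+    # track-query lifecycle thresholds (MOTR-style): detections above score_thresh spawn new tracks, tracks below filter_score_thresh are dropped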
+ qim_args=dict(
+ qim_type="QIMBase",
+ merger_dropout=0,
+ update_query_pos=True,
+ fp_ratio=0.3,
+ random_drop=0.1,
+ ), # hyper-param for query dropping mentioned in MOTR
+ mem_args=dict(
+ memory_bank_type="MemoryBank",
+ memory_bank_score_thresh=0.0,
+ memory_bank_len=4,
+ ),
+ loss_cfg=dict(
+ type="ClipMatcher",
+ num_classes=len(class_names),
+ weight_dict=None,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ assigner=dict(
+ type="HungarianAssigner3DTrack",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ pc_range=point_cloud_range,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ ), # loss cfg for tracking
+ pts_bbox_head=dict(
+ type="BEVFormerTrackHead",
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=900,
+ num_classes=len(class_names),
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ past_steps=past_steps,
+ fut_steps=fut_steps,
+ transformer=dict(
+ type="UniADPerceptionTransformer",
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type="BEVFormerEncoder",
+ num_layers=3,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type="BEVFormerLayer",
+ attn_cfgs=[
+ dict(
+ type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1
+ ),
+ dict(
+ type="SpatialCrossAttention",
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type="MSDeformableAttention3D",
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_,
+ ),
+ embed_dims=_dim_,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ decoder=dict(
+ type="DetectionTransformerDecoder",
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type="DetrTransformerDecoderLayer",
+ attn_cfgs=[
+ dict(
+ type="MultiheadAttention",
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0,
+ ),
+ dict(
+ type="CustomMSDeformableAttention",
+ embed_dims=_dim_,
+ num_levels=1,
+ ),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=(
+ "self_attn",
+ "norm",
+ "cross_attn",
+ "norm",
+ "ffn",
+ "norm",
+ ),
+ ),
+ ),
+ ),
+ bbox_coder=dict(
+ type="NMSFreeCoder",
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ pc_range=point_cloud_range,
+ max_num=300,
+ voxel_size=voxel_size,
+ num_classes=len(class_names),
+ ),
+ positional_encoding=dict(
+ type="LearnedPositionalEncoding",
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
+ ),
+ loss_bbox=dict(type="L1Loss", loss_weight=0.25),
+ loss_iou=dict(type="GIoULoss", loss_weight=0.0),
+ ),
+ seg_head=dict(
+ type='PansegformerHead',
+ bev_h=bev_h_*2,
+ bev_w=bev_w_*2,
+ canvas_size=canvas_size,
+ pc_range=point_cloud_range,
+ num_query=300,
+ num_classes=6,
+ num_things_classes=6,
+ num_stuff_classes=0,
+ in_channels=2048,
+ sync_cls_avg_factor=True,
+ as_two_stage=False,
+ with_box_refine=True,
+ transformer=dict(
+ type='SegDeformableTransformer',
+ encoder=dict(
+ type='DetrTransformerEncoder',
+ num_layers=6,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ ),
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+ decoder=dict(
+ type='DeformableDetrTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.0),
+ dict(
+ type='MultiScaleDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=_num_levels_,
+ )
+ ],
+ feedforward_channels=_feed_dim_,
+ ffn_dropout=0.0,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')
+ ),
+ ),
+ ),
+ positional_encoding=dict(
+ type='SinePositionalEncoding',
+ num_feats=_dim_half_,
+ normalize=True,
+ offset=-0.5),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
+        thing_transformer_head=dict(type='SegMaskHead', d_model=_dim_, nhead=8, num_decoder_layers=4),
+        stuff_transformer_head=dict(type='SegMaskHead', d_model=_dim_, nhead=8, num_decoder_layers=6, self_attn=True),
+ train_cfg=dict(
+ assigner=dict(
+ type='HungarianAssigner',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ ),
+ assigner_with_mask=dict(
+ type='HungarianAssigner_multi_info',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+ mask_cost=dict(type='DiceCost', weight=2.0),
+ ),
+            sampler=dict(type='PseudoSampler'),
+            sampler_with_mask=dict(type='PseudoSampler_segformer'),
+ ),
+ ),
+ occ_head=dict(
+ type='OccHead',
+
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+
+ bev_proj_dim=256,
+ bev_proj_nlayers=4,
+
+ # Transformer
+ attn_mask_thresh=0.3,
+ transformer_decoder=dict(
+ type='DetrTransformerDecoder',
+ return_intermediate=True,
+ num_layers=5,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=dict(
+ type='MultiheadAttention',
+ embed_dims=256,
+ num_heads=8,
+ attn_drop=0.0,
+ proj_drop=0.0,
+ dropout_layer=None,
+ batch_first=False),
+ ffn_cfgs=dict(
+ embed_dims=256,
+ feedforward_channels=2048, # change to 512
+ num_fcs=2,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ffn_drop=0.0,
+ dropout_layer=None,
+ add_identity=True),
+ feedforward_channels=2048,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')),
+ init_cfg=None),
+ # Query
+ query_dim=256,
+ query_mlp_layers=3,
+
+ aux_loss_weight=1.,
+ loss_mask=dict(
+ type='FieryBinarySegmentationLoss',
+ use_top_k=True,
+ top_k_ratio=0.25,
+ future_discount=0.95,
+ loss_weight=5.0,
+ ignore_index=255,
+ ),
+ loss_dice=dict(
+ type='DiceLossWithMasks',
+ use_sigmoid=True,
+ activate=True,
+ reduction='mean',
+ naive_dice=True,
+ eps=1.0,
+ ignore_index=255,
+ loss_weight=1.0),
+
+
+ pan_eval=True,
+ test_seg_thresh=0.1,
+ test_with_track_score=True,
+ ),
+ motion_head=dict(
+ type='MotionHead',
+ bev_h=bev_h_*2,
+ bev_w=bev_w_*2,
+ num_query=300,
+ num_classes=len(class_names),
+ predict_steps=predict_steps,
+ predict_modes=predict_modes,
+ embed_dims=_dim_,
+ loss_traj=dict(type='TrajLoss',
+ use_variance=True,
+ cls_loss_weight=0.5,
+ nll_loss_weight=0.5,
+ loss_weight_minade=0.,
+ loss_weight_minfde=0.25),
+ num_cls_fcs=3,
+ pc_range=point_cloud_range,
+ group_id_list=group_id_list,
+ num_anchor=6,
+ use_nonlinear_optimizer=use_nonlinear_optimizer,
+ anchor_info_path='data/others/b2d_motion_anchor_infos_mode6.pkl',
+ transformerlayers=dict(
+ type='MotionTransformerDecoder',
+ pc_range=point_cloud_range,
+ embed_dims=_dim_,
+ num_layers=3,
+ transformerlayers=dict(
+ type='MotionTransformerAttentionLayer',
+ batch_first=True,
+ attn_cfgs=[
+ dict(
+ type='MotionDeformableAttention',
+ num_steps=predict_steps,
+ embed_dims=_dim_,
+ num_levels=1,
+ num_heads=8,
+ num_points=4,
+ sample_index=-1),
+ ],
+
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.0,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm')),
+ ),
+ ),
+ planning_head=dict(
+ type='PlanningHeadSingleMode',
+ embed_dims=256,
+ command_dim=6,
+ planning_steps=planning_steps,
+ loss_planning=dict(type='PlanningLoss'),
+ loss_collision=[dict(type='CollisionLoss', delta=0.0, weight=2.5),
+ dict(type='CollisionLoss', delta=0.5, weight=1.0),
+ dict(type='CollisionLoss', delta=1.0, weight=0.25)],
+ use_col_optim=use_col_optim,
+ planning_eval=True,
+ with_adapter=True,
+ ),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type="HungarianAssigner3D",
+ cls_cost=dict(type="FocalLossCost", weight=2.0),
+ reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
+ iou_cost=dict(
+ type="IoUCost", weight=0.0
+ ), # Fake cost. This is just to make it compatible with DETR head.
+ pc_range=point_cloud_range,
+ ),
+ )
+ ),
+)
+dataset_type = "B2D_E2E_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+map_root = "data/bench2drive/maps"
+map_file = "data/infos/b2d_map_infos.pkl"
+file_client_args = dict(backend="disk")
+ann_file_train=info_root + f"/b2d_infos_train.pkl"
+ann_file_val=info_root + f"/b2d_infos_val.pkl"
+ann_file_test=info_root + f"/b2d_infos_val.pkl"
+
+train_pipeline = [
+ dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root),
+ dict(type="PhotoMetricDistortionMultiViewImage"),
+ dict(
+ type="LoadAnnotations3D_E2E",
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+ with_vis_token=False,
+ with_future_anns=True, # occ_flow gt
+ with_ins_inds_3d=True, # ins_inds
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+
+ dict(type='GenerateOccFlowLabels',
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+ only_vehicle=True,
+ filter_invisible=False,
+ all_classes = class_names,
+ vehicle_classes = ['car','van','truck','bicycle'],
+ plan_classes = ['car','van','truck','bicycle','pedestrian'],
+ ),
+
+ dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range),
+ dict(type="ObjectNameFilterTrack", classes=class_names),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type="DefaultFormatBundle3D", class_names=class_names),
+ dict(
+ type="CustomCollect3D",
+ keys=[
+ "gt_bboxes_3d",
+ "gt_labels_3d",
+ "gt_inds",
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_fut_traj",
+ "gt_fut_traj_mask",
+ "gt_past_traj",
+ "gt_past_traj_mask",
+ "gt_sdc_bbox",
+ "gt_sdc_label",
+ "gt_sdc_fut_traj",
+ "gt_sdc_fut_traj_mask",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ # Occ gt
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # gt future bbox for plan
+ "gt_future_boxes",
+ "gt_future_labels",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ],
+ ),
+]
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(type='LoadAnnotations3D_E2E',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_attr_label=False,
+ with_vis_token=False,
+ with_future_anns=True,
+ with_ins_inds_3d=False,
+ ins_inds_add_1=True, # ins_inds start from 1
+ ),
+ dict(type='GenerateOccFlowLabels',
+ grid_conf=occflow_grid_conf,
+ ignore_index=255,
+ only_vehicle=True,
+ filter_invisible=False,
+ all_classes = class_names,
+ vehicle_classes = ['car','van','truck','bicycle'],
+ plan_classes = ['car','van','truck','bicycle','pedestrian'],
+ ),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "gt_lane_labels",
+ "gt_lane_bboxes",
+ "gt_lane_masks",
+ "gt_segmentation",
+ "gt_instance",
+ "gt_centerness",
+ "gt_offset",
+ "gt_flow",
+ "gt_backward_flow",
+ "gt_occ_has_invalid_frame",
+ "gt_occ_img_is_valid",
+ # planning
+ "sdc_planning",
+ "sdc_planning_mask",
+ "command",
+ ]
+ ),
+ ],
+ ),
+]
+
+inference_only_pipeline = [
+ dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True,
+ file_client_args=file_client_args, img_root=data_root),
+ dict(type="NormalizeMultiviewImage", **img_norm_cfg),
+ dict(type="PadMultiViewImage", size_divisor=32),
+ dict(
+ type="MultiScaleFlipAug3D",
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type="DefaultFormatBundle3D", class_names=class_names, with_label=False
+ ),
+ dict(
+ type="CustomCollect3D", keys=[
+ "img",
+ "timestamp",
+ "l2g_r_mat",
+ "l2g_t",
+ "command",
+ ]
+ ),
+ ],
+ ),
+]
+
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ patch_size=patch_size,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ point_cloud_range=point_cloud_range,
+ box_type_3d="LiDAR",
+ ),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ bev_size=(bev_h_, bev_w_),
+ predict_frames=predict_steps,
+ past_frames=past_steps,
+ future_frames=fut_steps,
+ classes=class_names,
+ modality=input_modality,
+ samples_per_gpu=1,
+ point_cloud_range=point_cloud_range,
+ eval_cfg=eval_cfg,
+ #eval_mod=['det', 'track', 'map'],
+ box_type_3d="LiDAR",
+ ),
+ shuffler_sampler=dict(type="DistributedGroupSampler"),
+ nonshuffler_sampler=dict(type="DistributedSampler"),
+)
+optimizer = dict(
+ type="AdamW",
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ "img_backbone": dict(lr_mult=0.1),
+ }
+ ),
+ weight_decay=0.01,
+)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ by_epoch=False,
+ policy="CosineAnnealing",
+ warmup="linear",
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3,
+)
+total_epochs = 1
+evaluation = dict(interval=1, pipeline=test_pipeline)
+runner = dict(type="EpochBasedRunner", max_epochs=total_epochs)
+log_config = dict(
+ interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]
+)
+checkpoint_config = dict(interval=3000, by_epoch=False)
+
+find_unused_parameters = True
\ No newline at end of file
diff --git a/adzoo/uniad/data_converter/create_data.py b/adzoo/uniad/data_converter/create_data.py
new file mode 100755
index 0000000..0adb360
--- /dev/null
+++ b/adzoo/uniad/data_converter/create_data.py
@@ -0,0 +1,109 @@
+import argparse
+from os import path as osp
+import sys
+from data_converter import uniad_nuscenes_converter as nuscenes_converter
+
+def nuscenes_data_prep(root_path,
+ can_bus_root_path,
+ info_prefix,
+ version,
+ dataset_name,
+ out_dir,
+ max_sweeps=10):
+ """Prepare data related to nuScenes dataset.
+
+ Related data consists of '.pkl' files recording basic infos,
+ 2D annotations and groundtruth database.
+
+ Args:
+        root_path (str): Path of dataset root.
+        can_bus_root_path (str): Path of the nuScenes CAN bus data root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ dataset_name (str): The dataset class name.
+ out_dir (str): Output directory of the groundtruth database info.
+ max_sweeps (int): Number of input consecutive frames. Default: 10
+ """
+ nuscenes_converter.create_nuscenes_infos(
+ root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+ if version == 'v1.0-test':
+ info_test_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_test.pkl')
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_test_path, version=version)
+ else:
+ info_train_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_train.pkl')
+ info_val_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_val.pkl')
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_train_path, version=version)
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_val_path, version=version)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+ '--root-path',
+ type=str,
+ default='./data/kitti',
+ help='specify the root path of dataset')
+parser.add_argument(
+ '--canbus',
+ type=str,
+ default='./data',
+ help='specify the root path of nuScenes canbus')
+parser.add_argument(
+ '--version',
+ type=str,
+ default='v1.0',
+ required=False,
+ help='specify the dataset version, no need for kitti')
+parser.add_argument(
+ '--max-sweeps',
+ type=int,
+ default=10,
+ required=False,
+ help='specify sweeps of lidar per example')
+parser.add_argument(
+ '--out-dir',
+ type=str,
+ default='./data/kitti',
+ required=False,
+ help='name of info pkl')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+ '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+ if args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+ train_version = f'{args.version}-trainval'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ test_version = f'{args.version}-test'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=test_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+ train_version = f'{args.version}'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
\ No newline at end of file
diff --git a/adzoo/uniad/data_converter/uniad_create_data.sh b/adzoo/uniad/data_converter/uniad_create_data.sh
new file mode 100755
index 0000000..b9ac04d
--- /dev/null
+++ b/adzoo/uniad/data_converter/uniad_create_data.sh
@@ -0,0 +1,7 @@
+
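+# Generate UniAD-style nuScenes info pickles (train/val/test) and 2D COCO annotation json under ./data/infos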
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python tools/create_data.py nuscenes --root-path ./data/nuscenes \
+ --out-dir ./data/infos \
+ --extra-tag nuscenes \
+ --version v1.0 \
+    --canbus ./data/nuscenes
\ No newline at end of file
diff --git a/adzoo/uniad/data_converter/uniad_nuscenes_converter.py b/adzoo/uniad/data_converter/uniad_nuscenes_converter.py
new file mode 100644
index 0000000..4ff6ef8
--- /dev/null
+++ b/adzoo/uniad/data_converter/uniad_nuscenes_converter.py
@@ -0,0 +1,723 @@
+import numpy as np
+import os
+from collections import OrderedDict
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.geometry_utils import view_points
+from nuscenes.prediction import PredictHelper
+from os import path as osp
+from pyquaternion import Quaternion
+from shapely.geometry import MultiPoint, box
+from typing import List, Tuple, Union
+
+from mmcv.core.bbox.box_np_ops import points_cam2img
+from mmcv.datasets import NuScenesDataset
+from mmcv.fileio.io import load, dump
+from mmcv.utils import is_filepath, track_iter_progress, check_file_exist
+from mmcv.image import imread
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+ 'barrier')
+
+nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
+ 'pedestrian.moving', 'pedestrian.standing',
+ 'pedestrian.sitting_lying_down', 'vehicle.moving',
+ 'vehicle.parked', 'vehicle.stopped', 'None')
+
+
+def create_nuscenes_infos(root_path,
+ out_path,
+ can_bus_root_path,
+ info_prefix,
+ version='v1.0-trainval',
+ max_sweeps=10):
+ """Create info file of nuscene dataset.
+
+ Given the raw data, generate its related info file in pkl format.
+
+ Args:
+        root_path (str): Path of the data root.
+        out_path (str): Output directory for the generated info files.
+        can_bus_root_path (str): Path of the nuScenes CAN bus data root.
+        info_prefix (str): Prefix of the info file to be generated.
+ version (str): Version of the data.
+ Default: 'v1.0-trainval'
+ max_sweeps (int): Max number of sweeps.
+ Default: 10
+ """
+ from nuscenes.nuscenes import NuScenes
+ from nuscenes.can_bus.can_bus_api import NuScenesCanBus
+ print(version, root_path)
+ nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+ nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path)
+ from nuscenes.utils import splits
+ available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini']
+ assert version in available_vers
+ if version == 'v1.0-trainval':
+ train_scenes = splits.train
+ val_scenes = splits.val
+ elif version == 'v1.0-test':
+ train_scenes = splits.test
+ val_scenes = []
+ elif version == 'v1.0-mini':
+ train_scenes = splits.mini_train
+ val_scenes = splits.mini_val
+ else:
+ raise ValueError('unknown')
+
+ # filter existing scenes.
+ available_scenes = get_available_scenes(nusc)
+ available_scene_names = [s['name'] for s in available_scenes]
+ train_scenes = list(
+ filter(lambda x: x in available_scene_names, train_scenes))
+ val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+ train_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in train_scenes
+ ])
+ val_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in val_scenes
+ ])
+
+ test = 'test' in version
+ if test:
+ print('test scene: {}'.format(len(train_scenes)))
+ else:
+ print('train scene: {}, val scene: {}'.format(
+ len(train_scenes), len(val_scenes)))
+
+ train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
+ nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+ metadata = dict(version=version)
+ if test:
+ print('test sample: {}'.format(len(train_nusc_infos)))
+ data = dict(infos=train_nusc_infos, metadata=metadata)
+ info_path = osp.join(out_path,
+ '{}_infos_temporal_test.pkl'.format(info_prefix))
+ dump(data, info_path)
+ else:
+ print('train sample: {}, val sample: {}'.format(
+ len(train_nusc_infos), len(val_nusc_infos)))
+ data = dict(infos=train_nusc_infos, metadata=metadata)
+ info_path = osp.join(out_path,
+ '{}_infos_temporal_train.pkl'.format(info_prefix))
+ dump(data, info_path)
+ data['infos'] = val_nusc_infos
+ info_val_path = osp.join(out_path,
+ '{}_infos_temporal_val.pkl'.format(info_prefix))
+ dump(data, info_val_path)
+
+
+def get_available_scenes(nusc):
+ """Get available scenes from the input nuscenes class.
+
+ Given the raw data, get the information of available scenes for
+ further info generation.
+
+ Args:
+ nusc (class): Dataset class in the nuScenes dataset.
+
+ Returns:
+ available_scenes (list[dict]): List of basic information for the
+ available scenes.
+ """
+ available_scenes = []
+ print('total scene num: {}'.format(len(nusc.scene)))
+ for scene in nusc.scene:
+ scene_token = scene['token']
+ scene_rec = nusc.get('scene', scene_token)
+ sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
+ sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+ has_more_frames = True
+ scene_not_exist = False
+ while has_more_frames:
+ lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token'])
+ lidar_path = str(lidar_path)
+ if os.getcwd() in lidar_path:
+ # path from lyftdataset is absolute path
+ lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1]
+ # relative path
+ if not is_filepath(lidar_path):
+ scene_not_exist = True
+ break
+ else:
+ break
+ if scene_not_exist:
+ continue
+ available_scenes.append(scene)
+ print('exist scene num: {}'.format(len(available_scenes)))
+ return available_scenes
+
+
+def _get_can_bus_info(nusc, nusc_can_bus, sample):
+ scene_name = nusc.get('scene', sample['scene_token'])['name']
+ sample_timestamp = sample['timestamp']
+ try:
+ pose_list = nusc_can_bus.get_messages(scene_name, 'pose')
+ except:
+ return np.zeros(18) # server scenes do not have can bus information.
+ can_bus = []
+    # within each scene, the first can_bus timestamp may be larger than the first sample's timestamp
+ last_pose = pose_list[0]
+ for i, pose in enumerate(pose_list):
+ if pose['utime'] > sample_timestamp:
+ break
+ last_pose = pose
+ _ = last_pose.pop('utime') # useless
+ pos = last_pose.pop('pos')
+ rotation = last_pose.pop('orientation')
+ can_bus.extend(pos)
+ can_bus.extend(rotation)
+ for key in last_pose.keys():
+ can_bus.extend(pose[key]) # 16 elements
+ can_bus.extend([0., 0.])
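+    # 18 dims in total: position (3) + orientation quaternion (4) + remaining pose signals (9) + two trailing placeholder slots (zeros here)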
+ return np.array(can_bus)
+
+def _get_future_traj_info(nusc, sample, predict_steps=16):
+ sample_token = sample['token']
+ ann_tokens = np.array(sample['anns'])
+ sd_rec = nusc.get('sample', sample_token)
+ fut_traj_all = []
+ fut_traj_valid_mask_all = []
+ _, boxes, _ = nusc.get_sample_data(sd_rec['data']['LIDAR_TOP'], selected_anntokens=ann_tokens)
+ predict_helper = PredictHelper(nusc)
+ for i, ann_token in enumerate(ann_tokens):
+ box = boxes[i]
+ instance_token = nusc.get('sample_annotation', ann_token)['instance_token']
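+        # nuScenes keyframes are annotated at 2 Hz, so predict_steps future frames span predict_steps // 2 seconds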
+ fut_traj_local = predict_helper.get_future_for_agent(instance_token,
+ sample_token,
+ seconds=predict_steps//2,
+ in_agent_frame=True)
+
+ fut_traj = np.zeros((predict_steps, 2))
+ fut_traj_valid_mask = np.zeros((predict_steps, 2))
+ if fut_traj_local.shape[0] > 0:
+ # trans = box.center
+ # trans = np.array([0, 0, 0])
+ # rot = Quaternion(matrix=box.rotation_matrix)
+ # fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot)
+ fut_traj_scence_centric = fut_traj_local
+ fut_traj[:fut_traj_scence_centric.shape[0], :] = fut_traj_scence_centric
+ fut_traj_valid_mask[:fut_traj_scence_centric.shape[0], :] = 1
+ fut_traj_all.append(fut_traj)
+ fut_traj_valid_mask_all.append(fut_traj_valid_mask)
+ if len(ann_tokens) > 0:
+ fut_traj_all = np.stack(fut_traj_all, axis=0)
+ fut_traj_valid_mask_all = np.stack(fut_traj_valid_mask_all, axis=0)
+ else:
+ fut_traj_all = np.zeros((0, predict_steps, 2))
+ fut_traj_valid_mask_all = np.zeros((0, predict_steps, 2))
+ return fut_traj_all, fut_traj_valid_mask_all
+
+def _fill_trainval_infos(nusc,
+ nusc_can_bus,
+ train_scenes,
+ val_scenes,
+ test=False,
+ max_sweeps=10):
+ """Generate the train/val infos from the raw data.
+
+ Args:
+        nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
+        nusc_can_bus (:obj:`NuScenesCanBus`): CAN bus data API of nuScenes.
+ train_scenes (list[str]): Basic information of training scenes.
+ val_scenes (list[str]): Basic information of validation scenes.
+ test (bool): Whether use the test mode. In the test mode, no
+ annotations can be accessed. Default: False.
+ max_sweeps (int): Max number of sweeps. Default: 10.
+
+ Returns:
+ tuple[list[dict]]: Information of training set and validation set
+ that will be saved to the info file.
+ """
+ train_nusc_infos = []
+ val_nusc_infos = []
+ frame_idx = 0
+ for sample in track_iter_progress(nusc.sample):
+ lidar_token = sample['data']['LIDAR_TOP']
+ sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
+
+ check_file_exist(lidar_path)
+ can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample)
+ ##
+ info = {
+ 'lidar_path': lidar_path,
+ 'token': sample['token'],
+ 'prev': sample['prev'],
+ 'next': sample['next'],
+ 'can_bus': can_bus,
+ 'frame_idx': frame_idx, # temporal related info
+ 'sweeps': [],
+ 'cams': dict(),
+ 'scene_token': sample['scene_token'], # temporal related info
+ 'lidar2ego_translation': cs_record['translation'],
+ 'lidar2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sample['timestamp'],
+ }
+
+ if sample['next'] == '':
+ frame_idx = 0
+ else:
+ frame_idx += 1
+
+ l2e_r = info['lidar2ego_rotation']
+ l2e_t = info['lidar2ego_translation']
+ e2g_r = info['ego2global_rotation']
+ e2g_t = info['ego2global_translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ # obtain 6 image's information per frame
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ for cam in camera_types:
+ cam_token = sample['data'][cam]
+ cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
+ cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
+ e2g_t, e2g_r_mat, cam)
+ cam_info.update(cam_intrinsic=cam_intrinsic)
+ info['cams'].update({cam: cam_info})
+
+ # obtain sweeps for a single key-frame
+ sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ sweeps = []
+ while len(sweeps) < max_sweeps:
+ if not sd_rec['prev'] == '':
+ sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t,
+ l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+ sweeps.append(sweep)
+ sd_rec = nusc.get('sample_data', sd_rec['prev'])
+ else:
+ break
+ info['sweeps'] = sweeps
+ # obtain annotation
+ if not test:
+ annotations = [
+ nusc.get('sample_annotation', token)
+ for token in sample['anns']
+ ]
+ locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+ dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+ rots = np.array([b.orientation.yaw_pitch_roll[0]
+ for b in boxes]).reshape(-1, 1)
+ velocity = np.array(
+ [nusc.box_velocity(token)[:2] for token in sample['anns']])
+ valid_flag = np.array(
+ [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0
+ for anno in annotations],
+ dtype=bool).reshape(-1)
+ instance_inds = [nusc.getind('instance', ann['instance_token'])
+ for ann in annotations]
+ future_traj_all, future_traj_valid_mask_all = _get_future_traj_info(nusc, sample)
+        instance_tokens = [ann['instance_token'] for ann in annotations]
+ l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
+ e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
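+        # RT from the sweep sensor to the current top LiDAR: sweep sensor -> sweep ego -> global -> current ego -> current lidar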
+ R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
+ ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
+ sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T
+ sweep['sensor2lidar_translation'] = T
+ return sweep
+
+
+def export_2d_annotation(root_path, info_path, version, mono3d=True):
+ """Export 2d annotation from the info file and raw data.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ info_path (str): Path of the info file.
+ version (str): Dataset version.
+ mono3d (bool): Whether to export mono3d annotation. Default: True.
+ """
+ # get bbox annotations for camera
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ nusc_infos = load(info_path)['infos']
+ nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+ # info_2d_list = []
+ cat2Ids = [
+ dict(id=nus_categories.index(cat_name), name=cat_name)
+ for cat_name in nus_categories
+ ]
+ coco_ann_id = 0
+ coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+ for info in track_iter_progress(nusc_infos):
+ for cam in camera_types:
+ cam_info = info['cams'][cam]
+ coco_infos = get_2d_boxes(
+ nusc,
+ cam_info['sample_data_token'],
+ visibilities=['', '1', '2', '3', '4'],
+ mono3d=mono3d)
+ (height, width, _) = imread(cam_info['data_path']).shape
+ coco_2d_dict['images'].append(
+ dict(
+ file_name=cam_info['data_path'].split('data/nuscenes/')
+ [-1],
+ id=cam_info['sample_data_token'],
+ token=info['token'],
+ cam2ego_rotation=cam_info['sensor2ego_rotation'],
+ cam2ego_translation=cam_info['sensor2ego_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ ego2global_translation=info['ego2global_translation'],
+ cam_intrinsic=cam_info['cam_intrinsic'],
+ width=width,
+ height=height))
+ for coco_info in coco_infos:
+ if coco_info is None:
+ continue
+ # add an empty key for coco format
+ coco_info['segmentation'] = []
+ coco_info['id'] = coco_ann_id
+ coco_2d_dict['annotations'].append(coco_info)
+ coco_ann_id += 1
+ if mono3d:
+ json_prefix = f'{info_path[:-4]}_mono3d'
+ else:
+ json_prefix = f'{info_path[:-4]}'
+ dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(nusc,
+ sample_data_token: str,
+ visibilities: List[str],
+ mono3d=True):
+ """Get the 2D annotation records for a given `sample_data_token`.
+
+ Args:
+ sample_data_token (str): Sample data token belonging to a camera \
+ keyframe.
+ visibilities (list[str]): Visibility filter.
+ mono3d (bool): Whether to get boxes with mono3d annotation.
+
+ Return:
+ list[dict]: List of 2D annotation record that belongs to the input
+ `sample_data_token`.
+ """
+
+ # Get the sample data and the sample corresponding to that sample data.
+ sd_rec = nusc.get('sample_data', sample_data_token)
+
+ assert sd_rec[
+ 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
+ ' for camera sample_data!'
+ if not sd_rec['is_key_frame']:
+ raise ValueError(
+ 'The 2D re-projections are available only for keyframes.')
+
+ s_rec = nusc.get('sample', sd_rec['sample_token'])
+
+ # Get the calibrated sensor and ego pose
+ # record to get the transformation matrices.
+ cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
+ pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
+
+ # Get all the annotation with the specified visibilties.
+ ann_recs = [
+ nusc.get('sample_annotation', token) for token in s_rec['anns']
+ ]
+ ann_recs = [
+ ann_rec for ann_rec in ann_recs
+ if (ann_rec['visibility_token'] in visibilities)
+ ]
+
+ repro_recs = []
+
+ for ann_rec in ann_recs:
+ # Augment sample_annotation with token information.
+ ann_rec['sample_annotation_token'] = ann_rec['token']
+ ann_rec['sample_data_token'] = sample_data_token
+
+ # Get the box in global coordinates.
+ box = nusc.get_box(ann_rec['token'])
+
+ # Move them to the ego-pose frame.
+ box.translate(-np.array(pose_rec['translation']))
+ box.rotate(Quaternion(pose_rec['rotation']).inverse)
+
+ # Move them to the calibrated sensor frame.
+ box.translate(-np.array(cs_rec['translation']))
+ box.rotate(Quaternion(cs_rec['rotation']).inverse)
+
+ # Filter out the corners that are not in front of the calibrated
+ # sensor.
+ corners_3d = box.corners()
+ in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+ corners_3d = corners_3d[:, in_front]
+
+ # Project 3d box to 2d.
+ corner_coords = view_points(corners_3d, camera_intrinsic,
+ True).T[:, :2].tolist()
+
+ # Keep only corners that fall within the image.
+ final_coords = post_process_coords(corner_coords)
+
+ # Skip if the convex hull of the re-projected corners
+ # does not intersect the image canvas.
+ if final_coords is None:
+ continue
+ else:
+ min_x, min_y, max_x, max_y = final_coords
+
+ # Generate dictionary record to be included in the .json file.
+ repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+ sample_data_token, sd_rec['filename'])
+
+ # If mono3d=True, add 3D annotations in camera coordinates
+ if mono3d and (repro_rec is not None):
+ loc = box.center.tolist()
+
+ dim = box.wlh
+ dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
+ dim = dim.tolist()
+
+ rot = box.orientation.yaw_pitch_roll[0]
+ rot = [-rot] # convert the rot to our cam coordinate
+
+ global_velo2d = nusc.box_velocity(box.token)[:2]
+ global_velo3d = np.array([*global_velo2d, 0.0])
+ e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
+ c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
+ cam_velo3d = global_velo3d @ np.linalg.inv(
+ e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
+ velo = cam_velo3d[0::2].tolist()
+
+ repro_rec['bbox_cam3d'] = loc + dim + rot
+ repro_rec['velo_cam3d'] = velo
+
+ center3d = np.array(loc).reshape([1, 3])
+ center2d = points_cam2img(
+ center3d, camera_intrinsic, with_depth=True)
+ repro_rec['center2d'] = center2d.squeeze().tolist()
+ # normalized center2D + depth
+            # samples with non-positive depth are removed
+ if repro_rec['center2d'][2] <= 0:
+ continue
+
+ ann_token = nusc.get('sample_annotation',
+ box.token)['attribute_tokens']
+ if len(ann_token) == 0:
+ attr_name = 'None'
+ else:
+ attr_name = nusc.get('attribute', ann_token[0])['name']
+ attr_id = nus_attributes.index(attr_name)
+ repro_rec['attribute_name'] = attr_name
+ repro_rec['attribute_id'] = attr_id
+
+ repro_recs.append(repro_rec)
+
+ return repro_recs
+
+
+def post_process_coords(
+ corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
+) -> Union[Tuple[float, float, float, float], None]:
+ """Get the intersection of the convex hull of the reprojected bbox corners
+ and the image canvas, return None if no intersection.
+
+ Args:
+ corner_coords (list[int]): Corner coordinates of reprojected
+ bounding box.
+ imsize (tuple[int]): Size of the image canvas.
+
+ Return:
+ tuple [float]: Intersection of the convex hull of the 2D box
+ corners and the image canvas.
+ """
+ polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
+ img_canvas = box(0, 0, imsize[0], imsize[1])
+
+ if polygon_from_2d_box.intersects(img_canvas):
+ img_intersection = polygon_from_2d_box.intersection(img_canvas)
+ intersection_coords = np.array(
+ [coord for coord in img_intersection.exterior.coords])
+
+ min_x = min(intersection_coords[:, 0])
+ min_y = min(intersection_coords[:, 1])
+ max_x = max(intersection_coords[:, 0])
+ max_y = max(intersection_coords[:, 1])
+
+ return min_x, min_y, max_x, max_y
+ else:
+ return None
+
+
+def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
+ sample_data_token: str, filename: str) -> OrderedDict:
+ """Generate one 2D annotation record given various informations on top of
+ the 2D bounding box coordinates.
+
+ Args:
+ ann_rec (dict): Original 3d annotation record.
+ x1 (float): Minimum value of the x coordinate.
+ y1 (float): Minimum value of the y coordinate.
+ x2 (float): Maximum value of the x coordinate.
+ y2 (float): Maximum value of the y coordinate.
+ sample_data_token (str): Sample data token.
+        filename (str): The corresponding image file where the annotation
+ is present.
+
+ Returns:
+        dict: A sample 2D annotation record (or None if the box category is
+            not in ``NuScenesDataset.NameMapping``):
+            - file_name (str): file name
+ - image_id (str): sample data token
+ - area (float): 2d box area
+ - category_name (str): category name
+ - category_id (int): category id
+ - bbox (list[float]): left x, top y, dx, dy of 2d box
+ - iscrowd (int): whether the area is crowd
+ """
+ repro_rec = OrderedDict()
+ repro_rec['sample_data_token'] = sample_data_token
+ coco_rec = dict()
+
+ relevant_keys = [
+ 'attribute_tokens',
+ 'category_name',
+ 'instance_token',
+ 'next',
+ 'num_lidar_pts',
+ 'num_radar_pts',
+ 'prev',
+ 'sample_annotation_token',
+ 'sample_data_token',
+ 'visibility_token',
+ ]
+
+ for key, value in ann_rec.items():
+ if key in relevant_keys:
+ repro_rec[key] = value
+
+ repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+ repro_rec['filename'] = filename
+
+ coco_rec['file_name'] = filename
+ coco_rec['image_id'] = sample_data_token
+ coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+ if repro_rec['category_name'] not in NuScenesDataset.NameMapping:
+ return None
+ cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']]
+ coco_rec['category_name'] = cat_name
+ coco_rec['category_id'] = nus_categories.index(cat_name)
+ coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+ coco_rec['iscrowd'] = 0
+
+ return coco_rec
+
\ No newline at end of file
diff --git a/adzoo/uniad/test.py b/adzoo/uniad/test.py
new file mode 100755
index 0000000..9442514
--- /dev/null
+++ b/adzoo/uniad/test.py
@@ -0,0 +1,145 @@
+import argparse
+import torch
+import os
+import warnings
+from torch.nn.parallel.distributed import DistributedDataParallel
+from mmcv.utils import get_dist_info, init_dist, wrap_fp16_model, set_random_seed, Config, DictAction, load_checkpoint
+from mmcv.fileio.io import dump
+from mmcv.datasets import build_dataset, build_dataloader, replace_ImageToTensor
+from mmcv.models import build_model, fuse_conv_bn
+import time
+import os.path as osp
+from adzoo.uniad.test_utils import custom_multi_gpu_test, custom_single_gpu_test
+import cv2
+cv2.setNumThreads(1)
+
+warnings.filterwarnings("ignore")
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet test (and eval) a model')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument('--out', default='output/results.pkl', help='output result file in pickle format')
+ parser.add_argument(
+ '--fuse-conv-bn',
+ action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+ parser.add_argument(
+ '--eval',
+ type=str,
+ nargs='+',
+ help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+ ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+ parser.add_argument('--show', action='store_true', help='show results')
+ parser.add_argument(
+ '--show-dir', help='directory where results will be saved')
+ parser.add_argument(
+ '--gpu-collect',
+ action='store_true',
+ help='whether to use gpu to collect results.')
+ parser.add_argument(
+ '--tmpdir',
+ help='tmp directory used for collecting results from multiple '
+ 'workers, available when gpu-collect is not specified')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local-rank', type=int, default=0)
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+ return args
+
+
+def main():
+ args = parse_args()
+ cfg = Config.fromfile(args.config)
+
+ cfg.model.pretrained = None
+ cfg.data.test.test_mode = True
+ samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+ if samples_per_gpu > 1:
+ # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+ cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ else:
+ distributed = True
+ torch.backends.cudnn.benchmark = True
+ init_dist(args.launcher, **cfg.dist_params)
+ rank, world_size = get_dist_info()
+
+ set_random_seed(args.seed, deterministic=args.deterministic)
+
+ # Dataloader
+ dataset = build_dataset(cfg.data.test)
+ data_loader = build_dataloader(dataset,
+ samples_per_gpu=samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+ )
+
+ # Model
+ cfg.model.train_cfg = None
+ model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(model)
+ checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+ if args.fuse_conv_bn:
+ model = fuse_conv_bn(model)
+
+    # Add classes info
+ if 'CLASSES' in checkpoint.get('meta', {}): # for det
+ model.CLASSES = checkpoint['meta']['CLASSES']
+ else:
+ model.CLASSES = dataset.CLASSES
+ if 'PALETTE' in checkpoint.get('meta', {}): # for seg
+ model.PALETTE = checkpoint['meta']['PALETTE']
+ elif hasattr(dataset, 'PALETTE'):
+ model.PALETTE = dataset.PALETTE
+
+ if not distributed:
+ assert False #TODO(yzj)
+ # model = MMDataParallel(model, device_ids=[0])
+ # outputs = custom_single_gpu_test(model, data_loader, args.show, args.show_dir)
+ else:
+ model = DistributedDataParallel(model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ )
+ outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect)
+
+
+
+ if rank == 0:
+ if args.out:
+ print(f'\nwriting results to {args.out}')
+ dump(outputs, args.out)
+ kwargs = {}
+ kwargs['jsonfile_prefix'] = osp.join('test', args.config.split('/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_'))
+
+ if args.eval:
+ eval_kwargs = cfg.get('evaluation', {}).copy()
+ # hard-code way to remove EvalHook args
+ for key in ['interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule']:
+ eval_kwargs.pop(key, None)
+ eval_kwargs.update(dict(metric=args.eval, **kwargs))
+ print(dataset.evaluate(outputs, **eval_kwargs))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/uniad/test_utils.py b/adzoo/uniad/test_utils.py
new file mode 100644
index 0000000..4be8936
--- /dev/null
+++ b/adzoo/uniad/test_utils.py
@@ -0,0 +1,318 @@
+import os
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import torch
+import torch.distributed as dist
+
+from mmcv.models.dense_heads.occ_head_plugin import IntersectionOverUnion, PanopticMetric
+from mmcv.models.dense_heads.planning_head_plugin import UniADPlanningMetric
+from mmcv.utils import ProgressBar, mkdir_or_exist, get_dist_info
+from mmcv.fileio.io import load, dump
+import numpy as np
+import pycocotools.mask as mask_util
+
+def custom_encode_mask_results(mask_results):
+ """Encode bitmap mask to RLE code. Semantic Masks only
+ Args:
+ mask_results (list | tuple[list]): bitmap mask results.
+ In mask scoring rcnn, mask_results is a tuple of (segm_results,
+ segm_cls_score).
+ Returns:
+ list | tuple: RLE encoded mask.
+ """
+ cls_segms = mask_results
+ num_classes = len(cls_segms)
+ encoded_mask_results = []
+ for i in range(len(cls_segms)):
+ encoded_mask_results.append(
+ mask_util.encode(
+ np.array(
+ cls_segms[i][:, :, np.newaxis], order='F',
+ dtype='uint8'))[0]) # encoded with RLE
+ return [encoded_mask_results]
+
+def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+ """Test model with multiple gpus.
+ This method tests model with multiple gpus and collects the results
+    under two different modes: gpu and cpu. By setting 'gpu_collect=True',
+    it encodes results to gpu tensors and uses gpu communication for result
+    collection. In cpu mode it saves the results on different gpus to 'tmpdir'
+    and collects them on the rank 0 worker.
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+
+ # Occ eval init
+ eval_occ = hasattr(model.module, 'with_occ_head') \
+ and model.module.with_occ_head
+ if eval_occ:
+ # 30mx30m, 100mx100m at 50cm resolution
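+        # The BEV grid is 200x200 cells at 0.5 m per cell, i.e. 100 m x 100 m;
+        # the (70, 130) slice keeps the central 60 cells (~30 m x 30 m).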
+ EVALUATION_RANGES = {'30x30': (70, 130),
+ '100x100': (0, 200)}
+ n_classes = 2
+ iou_metrics = {}
+ for key in EVALUATION_RANGES.keys():
+ iou_metrics[key] = IntersectionOverUnion(n_classes).cuda()
+ panoptic_metrics = {}
+ for key in EVALUATION_RANGES.keys():
+ panoptic_metrics[key] = PanopticMetric(n_classes=n_classes, temporally_consistent=True).cuda()
+
+ # Plan eval init
+ eval_planning = hasattr(model.module, 'with_planning_head') \
+ and model.module.with_planning_head
+ if eval_planning:
+ planning_metrics = UniADPlanningMetric().cuda()
+
+ bbox_results = []
+ mask_results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ prog_bar = ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ have_mask = False
+ num_occ = 0
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(data, return_loss=False, rescale=True)
+
+
+ #import pdb;pdb.set_trace()
+
+ # # EVAL planning
+ if eval_planning:
+ # TODO: Wrap below into a func
+ segmentation = result[0]['planning']['planning_gt']['segmentation']
+ sdc_planning = result[0]['planning']['planning_gt']['sdc_planning']
+ sdc_planning_mask = result[0]['planning']['planning_gt']['sdc_planning_mask']
+ pred_sdc_traj = result[0]['planning']['result_planning']['sdc_traj']
+ result[0]['planning_traj'] = result[0]['planning']['result_planning']['sdc_traj']
+ result[0]['planning_traj_gt'] = result[0]['planning']['planning_gt']['sdc_planning']
+ result[0]['command'] = result[0]['planning']['planning_gt']['command']
+ planning_metrics(pred_sdc_traj[:, :6, :2], sdc_planning[0][0,:, :6, :2], sdc_planning_mask[0][0,:, :6, :2], segmentation[0][:, [1,2,3,4,5,6]])
+
+ # # Eval Occ
+ if eval_occ:
+ occ_has_invalid_frame = data['gt_occ_has_invalid_frame'][0]
+ occ_to_eval = not occ_has_invalid_frame.item()
+ if occ_to_eval and 'occ' in result[0].keys():
+ num_occ += 1
+ for key, grid in EVALUATION_RANGES.items():
+ limits = slice(grid[0], grid[1])
+ iou_metrics[key](result[0]['occ']['seg_out'][..., limits, limits].contiguous(),
+ result[0]['occ']['seg_gt'][..., limits, limits].contiguous())
+ panoptic_metrics[key](result[0]['occ']['ins_seg_out'][..., limits, limits].contiguous().detach(),
+ result[0]['occ']['ins_seg_gt'][..., limits, limits].contiguous())
+
+        # Pop out unnecessary occ results to avoid moving them to cpu in collect_results_cpu
+ if os.environ.get('ENABLE_PLOT_MODE', None) is None:
+ result[0].pop('occ', None)
+ result[0].pop('planning', None)
+ else:
+ for k in ['seg_gt', 'ins_seg_gt', 'pred_ins_sigmoid', 'seg_out', 'ins_seg_out']:
+ if k in result[0]['occ']:
+ result[0]['occ'][k] = result[0]['occ'][k].detach().cpu()
+ for k in ['bbox', 'segm', 'labels', 'panoptic', 'drivable', 'score_list', 'lane', 'lane_score', 'stuff_score_list']:
+ if k in result[0]['pts_bbox'] and isinstance(result[0]['pts_bbox'][k], torch.Tensor):
+ result[0]['pts_bbox'][k] = result[0]['pts_bbox'][k].detach().cpu()
+
+ # # encode mask results
+ if isinstance(result, dict):
+ if 'bbox_results' in result.keys():
+ bbox_result = result['bbox_results']
+ batch_size = len(result['bbox_results'])
+ bbox_results.extend(bbox_result)
+ if 'mask_results' in result.keys() and result['mask_results'] is not None:
+ mask_result = custom_encode_mask_results(result['mask_results'])
+ mask_results.extend(mask_result)
+ have_mask = True
+ else:
+ batch_size = len(result)
+ bbox_results.extend(result)
+
+
+ if rank == 0:
+ for _ in range(batch_size * world_size):
+ prog_bar.update()
+
+ # break
+
+ # collect results from all ranks
+ if gpu_collect:
+ bbox_results = collect_results_gpu(bbox_results, len(dataset))
+ if have_mask:
+ mask_results = collect_results_gpu(mask_results, len(dataset))
+ else:
+ mask_results = None
+ else:
+ bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
+ tmpdir = tmpdir+'_mask' if tmpdir is not None else None
+ if have_mask:
+ mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
+ else:
+ mask_results = None
+
+ if eval_planning:
+ planning_results = planning_metrics.compute()
+ planning_metrics.reset()
+
+ ret_results = dict()
+ ret_results['bbox_results'] = bbox_results
+ if eval_occ:
+ occ_results = {}
+ for key, grid in EVALUATION_RANGES.items():
+ panoptic_scores = panoptic_metrics[key].compute()
+ for panoptic_key, value in panoptic_scores.items():
+ occ_results[f'{panoptic_key}'] = occ_results.get(f'{panoptic_key}', []) + [100 * value[1].item()]
+ panoptic_metrics[key].reset()
+
+ iou_scores = iou_metrics[key].compute()
+ occ_results['iou'] = occ_results.get('iou', []) + [100 * iou_scores[1].item()]
+ iou_metrics[key].reset()
+
+ occ_results['num_occ'] = num_occ # count on one gpu
+ occ_results['ratio_occ'] = num_occ / len(dataset) # count on one gpu, but reflect the relative ratio
+ ret_results['occ_results_computed'] = occ_results
+ if eval_planning:
+ ret_results['planning_results_computed'] = planning_results
+
+ if mask_results is not None:
+ ret_results['mask_results'] = mask_results
+ return ret_results
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
+ MAX_LEN = 512
+ # 32 is whitespace
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_list.append(load(part_file))
+ # sort the results
+ ordered_results = []
+        # Because the evaluation sampler assigns each gpu a contiguous chunk of
+        # samples, the parts are concatenated in rank order rather than
+        # interleaved (the original `zip(*part_list)` scheme).
+ #for res in zip(*part_list):
+ for res in part_list:
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir)
+ return ordered_results
+
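+# Illustrative round trip of the tmpdir broadcast above: the directory name is
+# padded with spaces (byte 32) into a fixed-length uint8 tensor, broadcast from
+# rank 0, then decoded and rstrip()-ed back to a string on every rank, e.g.
+#   t = torch.full((512,), 32, dtype=torch.uint8)
+#   name = torch.tensor(bytearray('.dist_test/tmpabc'.encode()), dtype=torch.uint8)
+#   t[:len(name)] = name
+#   assert t.numpy().tobytes().decode().rstrip() == '.dist_test/tmpabc'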
+
+def collect_results_gpu(result_part, size):
+    # Fall back to cpu collection (the tmpdir argument is not used here).
+    return collect_results_cpu(result_part, size)
+
+def custom_single_gpu_test(model,
+ data_loader,
+ show=False,
+ out_dir=None,
+ show_score_thr=0.3):
+ """Test model with single gpu.
+
+ This method tests model with single gpu and gives the 'show' option.
+ By setting ``show=True``, it saves the visualization results under
+ ``out_dir``.
+
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+        show (bool): Whether to save visualization results.
+ Default: True.
+ out_dir (str): The path to save visualization results.
+ Default: None.
+
+ Returns:
+ list[dict]: The prediction results.
+ """
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ prog_bar = ProgressBar(len(dataset))
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(return_loss=False, rescale=True, **data)
+
+ if show:
+ # Visualize the results of MMDetection3D model
+ # 'show_results' is MMdetection3D visualization API
+ models_3d = (Base3DDetector, Base3DSegmentor,
+ SingleStageMono3DDetector)
+ if isinstance(model.module, models_3d):
+ model.module.show_results(data, result, out_dir=out_dir)
+ # Visualize the results of MMDetection model
+ # 'show_result' is MMdetection visualization API
+ else:
+ batch_size = len(result)
+ if batch_size == 1 and isinstance(data['img'][0],
+ torch.Tensor):
+ img_tensor = data['img'][0]
+ else:
+ img_tensor = data['img'][0].data[0]
+ img_metas = data['img_metas'][0].data[0]
+ imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
+ assert len(imgs) == len(img_metas)
+
+ for i, (img, img_meta) in enumerate(zip(imgs, img_metas)):
+ h, w, _ = img_meta['img_shape']
+ img_show = img[:h, :w, :]
+
+ ori_h, ori_w = img_meta['ori_shape'][:-1]
+ img_show = imresize(img_show, (ori_w, ori_h))
+
+ if out_dir:
+ out_file = osp.join(out_dir, img_meta['ori_filename'])
+ else:
+ out_file = None
+
+ model.module.show_result(
+ img_show,
+ result[i],
+ show=show,
+ out_file=out_file,
+ score_thr=show_score_thr)
+ results.extend(result)
+
+ batch_size = len(result)
+ for _ in range(batch_size):
+ prog_bar.update()
+ return results
\ No newline at end of file
diff --git a/adzoo/uniad/train.py b/adzoo/uniad/train.py
new file mode 100755
index 0000000..1df7f99
--- /dev/null
+++ b/adzoo/uniad/train.py
@@ -0,0 +1,212 @@
+import argparse
+import torch
+import copy
+import os
+import time
+import warnings
+from os import path as osp
+from mmcv import __version__ as mmcv_version
+from mmcv.datasets import build_dataset
+from mmcv.models import build_model
+from mmcv.utils import collect_env, get_root_logger, mkdir_or_exist, set_random_seed, get_dist_info, init_dist, \
+ Config, DictAction, TORCH_VERSION, digit_version
+from mmcv.datasets.builder import build_dataloader
+from mmcv.optims import build_optimizer
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook
+from mmcv.core import EvalHook
+from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
+ Fp16OptimizerHook, OptimizerHook, build_runner)
+from adzoo.uniad.test_utils import custom_multi_gpu_test
+
+warnings.filterwarnings("ignore")
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Train a detector')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--work-dir', help='the dir to save logs and models')
+ parser.add_argument(
+ '--resume-from', help='the checkpoint file to resume from')
+ parser.add_argument(
+ '--no-validate',
+ action='store_true',
+ help='whether not to evaluate the checkpoint during training')
+ group_gpus = parser.add_mutually_exclusive_group()
+ group_gpus.add_argument(
+ '--gpus',
+ type=int,
+ help='number of gpus to use '
+ '(only applicable to non-distributed training)')
+ group_gpus.add_argument(
+ '--gpu-ids',
+ type=int,
+ nargs='+',
+ help='ids of gpus to use '
+ '(only applicable to non-distributed training)')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local-rank', type=int, default=0)
+ parser.add_argument(
+ '--autoscale-lr',
+ action='store_true',
+ help='automatically scale lr with the number of gpus')
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ return args
+
+
+def main():
+ args = parse_args()
+ cfg = Config.fromfile(args.config)
+
+ if args.work_dir is not None:
+ cfg.work_dir = args.work_dir
+ else:
+ cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0])
+
+ # if args.resume_from is not None:
+ if args.resume_from is not None and osp.isfile(args.resume_from):
+ cfg.resume_from = args.resume_from
+
+ if args.gpu_ids is not None:
+ cfg.gpu_ids = args.gpu_ids
+ else:
+ cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+ if args.autoscale_lr:
+ # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+ cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
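+        # e.g. with this rule the configured lr is kept as-is for 8 gpus and
+        # doubled for 16 gpus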
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ elif args.launcher == 'pytorch':
+ torch.backends.cudnn.benchmark = True
+ distributed = True
+ init_dist(args.launcher, **cfg.dist_params)
+ rank, world_size = get_dist_info()
+ cfg.gpu_ids = range(world_size)
+
+ # Create work_dir
+ mkdir_or_exist(osp.abspath(cfg.work_dir))
+ cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+
+ # meta info
+ meta = dict()
+ env_info_dict = collect_env()
+ env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+ dash_line = '-' * 60 + '\n'
+ meta['env_info'] = env_info
+ meta['config'] = cfg.pretty_text
+ meta['seed'] = args.seed
+ meta['exp_name'] = osp.basename(args.config)
+
+ # seed
+ cfg.seed = args.seed
+ set_random_seed(args.seed, deterministic=args.deterministic)
+
+ # logger
+ log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+ logger = get_root_logger(log_file=log_file, log_level=cfg.log_level, name=cfg.model.type)
+ logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line)
+ logger.info(f'Distributed training: {distributed}')
+ logger.info(f'Config:\n{cfg.pretty_text}')
+ logger.info(f'Set random seed to {args.seed}, 'f'deterministic: {args.deterministic}')
+
+ # Dataset
+ datasets = [build_dataset(cfg.data.train)]
+
+ # Save meta info
+ if cfg.checkpoint_config is not None:
+ cfg.checkpoint_config.meta = dict(mmcv_version=mmcv_version, config=cfg.pretty_text, CLASSES=datasets[0].CLASSES, \
+ PALETTE=datasets[0].PALETTE if hasattr(datasets[0], 'PALETTE') else None) # # for segmentors
+
+ # Dataloader
+ datasets = datasets if isinstance(datasets, (list, tuple)) else [datasets]
+ data_loaders = [build_dataloader(ds,
+ cfg.data.samples_per_gpu,
+ cfg.data.workers_per_gpu,
+ # cfg.gpus will be ignored if distributed
+ len(cfg.gpu_ids),
+ dist=distributed,
+ seed=cfg.seed,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ ) for ds in datasets
+ ]
+
+ # Model
+ model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))
+ model.init_weights()
+ model.CLASSES = datasets[0].CLASSES # add an attribute for visualization convenience
+ logger.info(f'Model:\n{model}')
+ if distributed:
+ find_unused_parameters = cfg.get('find_unused_parameters', False)
+ model = DistributedDataParallel(model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters
+ )
+ else:
+ model = DataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+
+ # Optimizer
+ optimizer = build_optimizer(model, cfg.optimizer)
+ optimizer_config = OptimizerHook(**cfg.optimizer_config)
+
+ # Runner
+ runner = build_runner(cfg.runner, default_args=dict(model=model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+ runner.timestamp = timestamp
+ runner.register_training_hooks(cfg.lr_config, optimizer_config,
+ cfg.checkpoint_config, cfg.log_config,
+ cfg.get('momentum_config', None))
+ if distributed:
+ if isinstance(runner, EpochBasedRunner):
+ runner.register_hook(DistSamplerSeedHook())
+
+ # Register eval hooks for interval eval
+ val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
+ if val_samples_per_gpu > 1:
+ assert False
+ # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+ cfg.data.val.pipeline = replace_ImageToTensor(
+ cfg.data.val.pipeline)
+ val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+
+ val_dataloader = build_dataloader(
+ val_dataset,
+ samples_per_gpu=val_samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ )
+ eval_cfg = cfg.get('evaluation', {})
+ eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+ eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
+ eval_hook = CustomDistEvalHook if distributed else EvalHook
+ runner.register_hook(eval_hook(val_dataloader, test_fn=custom_multi_gpu_test, **eval_cfg))
+
+ if cfg.resume_from and os.path.exists(cfg.resume_from):
+ runner.resume(cfg.resume_from)
+ elif cfg.load_from:
+ runner.load_checkpoint(cfg.load_from)
+ runner.run(data_loaders, cfg.workflow)
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/uniad/uniad_dist_eval.sh b/adzoo/uniad/uniad_dist_eval.sh
new file mode 100755
index 0000000..12b2720
--- /dev/null
+++ b/adzoo/uniad/uniad_dist_eval.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+T=`date +%m%d%H%M`
+
+# -------------------------------------------------- #
+# Usually you only need to customize these variables #
+CFG=$1 #
+CKPT=$2 #
+GPUS=$3 #
+# -------------------------------------------------- #
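+# Example (illustrative paths):
+#   bash adzoo/uniad/uniad_dist_eval.sh path/to/config.py path/to/checkpoint.pth 8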
+GPUS_PER_NODE=$(($GPUS<8?$GPUS:8))
+
+MASTER_PORT=${MASTER_PORT:-12145}
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+# Intermediate files and logs will be saved to UniAD/projects/work_dirs/
+
+if [ ! -d ${WORK_DIR}logs ]; then
+ mkdir -p ${WORK_DIR}logs
+fi
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+ --nproc_per_node=$GPUS_PER_NODE \
+ --master_port=$MASTER_PORT \
+ $(dirname "$0")/test.py \
+ $CFG \
+ $CKPT \
+ --launcher pytorch ${@:4} \
+ --eval bbox \
+ --show-dir ${WORK_DIR} \
+ 2>&1 | tee ${WORK_DIR}logs/eval.$T
\ No newline at end of file
diff --git a/adzoo/uniad/uniad_dist_train.sh b/adzoo/uniad/uniad_dist_train.sh
new file mode 100755
index 0000000..313e20a
--- /dev/null
+++ b/adzoo/uniad/uniad_dist_train.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+T=`date +%m%d%H%M`
+
+# -------------------------------------------------- #
+# Usually you only need to customize these variables #
+CFG=$1 #
+GPUS=$2 #
+# -------------------------------------------------- #
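+# Example (illustrative path):
+#   bash adzoo/uniad/uniad_dist_train.sh path/to/config.py 8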
+GPUS_PER_NODE=$(($GPUS<8?$GPUS:8))
+NNODES=`expr $GPUS / $GPUS_PER_NODE`
+
+MASTER_PORT=${MASTER_PORT:-54621}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+RANK=${RANK:-0}
+
+WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
+# Intermediate files and logs will be saved to UniAD/projects/work_dirs/
+
+if [ ! -d ${WORK_DIR}logs ]; then
+ mkdir -p ${WORK_DIR}logs
+fi
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch \
+ --nproc_per_node=${GPUS_PER_NODE} \
+ --master_addr=${MASTER_ADDR} \
+ --master_port=${MASTER_PORT} \
+ --nnodes=${NNODES} \
+ --node_rank=${RANK} \
+ $(dirname "$0")/train.py \
+ $CFG \
+ --launcher pytorch ${@:3} \
+ --deterministic \
+ --work-dir ${WORK_DIR} \
+ 2>&1 | tee ${WORK_DIR}logs/train.$T
\ No newline at end of file
diff --git a/adzoo/uniad/uniad_vis_result.sh b/adzoo/uniad/uniad_vis_result.sh
new file mode 100755
index 0000000..b43a1be
--- /dev/null
+++ b/adzoo/uniad/uniad_vis_result.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+python ./tools/analysis_tools/visualize/run.py \
+ --predroot PATH_TO_YOUR_PREDISION_RESULT_PKL \
+ --out_folder PATH_TO_YOUR_OUTPUT_FOLDER \
+ --demo_video FILENAME_OF_OUTPUT_VIDEO \
+ --project_to_cam True
\ No newline at end of file
diff --git a/adzoo/vad/analysis_tools/__init__.py b/adzoo/vad/analysis_tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/adzoo/vad/analysis_tools/analyze_logs.py b/adzoo/vad/analysis_tools/analyze_logs.py
new file mode 100644
index 0000000..806175f
--- /dev/null
+++ b/adzoo/vad/analysis_tools/analyze_logs.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import numpy as np
+import seaborn as sns
+from collections import defaultdict
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+ for i, log_dict in enumerate(log_dicts):
+ print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+ all_times = []
+ for epoch in log_dict.keys():
+ if args.include_outliers:
+ all_times.append(log_dict[epoch]['time'])
+ else:
+ all_times.append(log_dict[epoch]['time'][1:])
+ all_times = np.array(all_times)
+ epoch_ave_time = all_times.mean(-1)
+ slowest_epoch = epoch_ave_time.argmax()
+ fastest_epoch = epoch_ave_time.argmin()
+ std_over_epoch = epoch_ave_time.std()
+ print(f'slowest epoch {slowest_epoch + 1}, '
+ f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+ print(f'fastest epoch {fastest_epoch + 1}, '
+ f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+ print(f'time std over epochs is {std_over_epoch:.4f}')
+ print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+ print()
+
+
+def plot_curve(log_dicts, args):
+ if args.backend is not None:
+ plt.switch_backend(args.backend)
+ sns.set_style(args.style)
+ # if legend is None, use {filename}_{key} as legend
+ legend = args.legend
+ if legend is None:
+ legend = []
+ for json_log in args.json_logs:
+ for metric in args.keys:
+ legend.append(f'{json_log}_{metric}')
+ assert len(legend) == (len(args.json_logs) * len(args.keys))
+ metrics = args.keys
+
+ num_metrics = len(metrics)
+ for i, log_dict in enumerate(log_dicts):
+ epochs = list(log_dict.keys())
+ for j, metric in enumerate(metrics):
+ print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+ if metric not in log_dict[epochs[args.interval - 1]]:
+ raise KeyError(
+ f'{args.json_logs[i]} does not contain metric {metric}')
+
+ if args.mode == 'eval':
+ if min(epochs) == args.interval:
+ x0 = args.interval
+ else:
+                    # if the current training was resumed from a previous
+                    # checkpoint, information from the early epochs is lost,
+                    # so `xs` should start according to `min(epochs)`
+ if min(epochs) % args.interval == 0:
+ x0 = min(epochs)
+ else:
+                        # find the first epoch that does eval
+ x0 = min(epochs) + args.interval - \
+ min(epochs) % args.interval
+ xs = np.arange(x0, max(epochs) + 1, args.interval)
+ ys = []
+ for epoch in epochs[args.interval - 1::args.interval]:
+ ys += log_dict[epoch][metric]
+
+ # if training is aborted before eval of the last epoch
+ # `xs` and `ys` will have different length and cause an error
+ # check if `ys[-1]` is empty here
+ if not log_dict[epoch][metric]:
+ xs = xs[:-1]
+
+ ax = plt.gca()
+ ax.set_xticks(xs)
+ plt.xlabel('epoch')
+ plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+ else:
+ xs = []
+ ys = []
+ num_iters_per_epoch = \
+ log_dict[epochs[args.interval-1]]['iter'][-1]
+ for epoch in epochs[args.interval - 1::args.interval]:
+ iters = log_dict[epoch]['iter']
+ if log_dict[epoch]['mode'][-1] == 'val':
+ iters = iters[:-1]
+ xs.append(
+ np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+ ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+ xs = np.concatenate(xs)
+ ys = np.concatenate(ys)
+ plt.xlabel('iter')
+ plt.plot(
+ xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+ plt.legend()
+ if args.title is not None:
+ plt.title(args.title)
+ if args.out is None:
+ plt.show()
+ else:
+ print(f'save curve to: {args.out}')
+ plt.savefig(args.out)
+ plt.cla()
+
+
+def add_plot_parser(subparsers):
+ parser_plt = subparsers.add_parser(
+ 'plot_curve', help='parser for plotting curves')
+ parser_plt.add_argument(
+ 'json_logs',
+ type=str,
+ nargs='+',
+ help='path of train log in json format')
+ parser_plt.add_argument(
+ '--keys',
+ type=str,
+ nargs='+',
+ default=['mAP_0.25'],
+ help='the metric that you want to plot')
+ parser_plt.add_argument('--title', type=str, help='title of figure')
+ parser_plt.add_argument(
+ '--legend',
+ type=str,
+ nargs='+',
+ default=None,
+ help='legend of each plot')
+ parser_plt.add_argument(
+ '--backend', type=str, default=None, help='backend of plt')
+ parser_plt.add_argument(
+ '--style', type=str, default='dark', help='style of plt')
+ parser_plt.add_argument('--out', type=str, default=None)
+ parser_plt.add_argument('--mode', type=str, default='train')
+ parser_plt.add_argument('--interval', type=int, default=1)
+
+
+def add_time_parser(subparsers):
+ parser_time = subparsers.add_parser(
+ 'cal_train_time',
+ help='parser for computing the average time per training iteration')
+ parser_time.add_argument(
+ 'json_logs',
+ type=str,
+ nargs='+',
+ help='path of train log in json format')
+ parser_time.add_argument(
+ '--include-outliers',
+ action='store_true',
+ help='include the first value of every epoch when computing '
+ 'the average time')
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Analyze Json Log')
+ # currently only support plot curve and calculate average train time
+ subparsers = parser.add_subparsers(dest='task', help='task parser')
+ add_plot_parser(subparsers)
+ add_time_parser(subparsers)
+ args = parser.parse_args()
+ return args
+
+
+def load_json_logs(json_logs):
+ # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+ # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+ # value of sub dict is a list of corresponding values of all iterations
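+    # e.g. the line {"mode": "train", "epoch": 1, "iter": 50, "loss": 0.9}
+    # becomes log_dict[1] == {'mode': ['train'], 'iter': [50], 'loss': [0.9]}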
+ log_dicts = [dict() for _ in json_logs]
+ for json_log, log_dict in zip(json_logs, log_dicts):
+ with open(json_log, 'r') as log_file:
+ for line in log_file:
+ log = json.loads(line.strip())
+ # skip lines without `epoch` field
+ if 'epoch' not in log:
+ continue
+ epoch = log.pop('epoch')
+ if epoch not in log_dict:
+ log_dict[epoch] = defaultdict(list)
+ for k, v in log.items():
+ log_dict[epoch][k].append(v)
+ return log_dicts
+
+
+def main():
+ args = parse_args()
+
+ json_logs = args.json_logs
+ for json_log in json_logs:
+ assert json_log.endswith('.json')
+
+ log_dicts = load_json_logs(json_logs)
+
+ eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/analysis_tools/benchmark.py b/adzoo/vad/analysis_tools/benchmark.py
new file mode 100644
index 0000000..487a348
--- /dev/null
+++ b/adzoo/vad/analysis_tools/benchmark.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+sys.path.append('.')
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+# from mmdet3d.datasets import build_dataloader, build_dataset
+from mmdet3d.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+ parser.add_argument('--samples', default=2000, help='samples to benchmark')
+ parser.add_argument(
+ '--log-interval', default=50, help='interval of logging')
+ parser.add_argument(
+ '--fuse-conv-bn',
+ action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+ cfg.model.pretrained = None
+ cfg.data.test.test_mode = True
+
+ # build the dataloader
+ # TODO: support multiple images per gpu (only minor changes are needed)
+ print(cfg.data.test)
+ dataset = custom_build_dataset(cfg.data.test)
+ data_loader = build_dataloader(
+ dataset,
+ samples_per_gpu=1,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=False,
+ shuffle=False)
+
+ # build the model and load checkpoint
+ cfg.model.train_cfg = None
+ model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(model)
+ if args.checkpoint is not None:
+ load_checkpoint(model, args.checkpoint, map_location='cpu')
+ #if args.fuse_conv_bn:
+ # model = fuse_module(model)
+
+ model = MMDataParallel(model, device_ids=[0])
+
+ model.eval()
+
+ # the first several iterations may be very slow so skip them
+ num_warmup = 5
+ pure_inf_time = 0
+
+ # benchmark with several samples and take the average
+ for i, data in enumerate(data_loader):
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+ with torch.no_grad():
+ model(return_loss=False, rescale=True, **data)
+
+ torch.cuda.synchronize()
+ elapsed = time.perf_counter() - start_time
+
+ if i >= num_warmup:
+ pure_inf_time += elapsed
+ if (i + 1) % args.log_interval == 0:
+ fps = (i + 1 - num_warmup) / pure_inf_time
+ print(f'Done image [{i + 1:<3}/ {args.samples}], '
+ f'fps: {fps:.1f} img / s')
+
+ if (i + 1) == args.samples:
+ pure_inf_time += elapsed
+ fps = (i + 1 - num_warmup) / pure_inf_time
+ print(f'Overall fps: {fps:.1f} img / s')
+ break
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/analysis_tools/get_flops.py b/adzoo/vad/analysis_tools/get_flops.py
new file mode 100644
index 0000000..1b9fb01
--- /dev/null
+++ b/adzoo/vad/analysis_tools/get_flops.py
@@ -0,0 +1,747 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import argparse
+
+import torch
+from mmcv import Config, DictAction
+
+from mmdet3d.models import build_model
+from mmdet3d.datasets import build_dataset
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+
+# try:
+# from mmcv.cnn import get_model_complexity_info
+# except ImportError:
+# raise ImportError('Please upgrade mmcv to >0.6.2')
+
+import sys
+sys.path.append('.')
+
+
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+import mmcv
+
+
+def get_model_complexity_info(model,
+ data,
+ input_shape=(1280, 720),
+ print_per_layer_stat=True,
+ as_strings=True,
+ input_constructor=None,
+ flush=False,
+ ost=sys.stdout):
+ """Get complexity information of a model.
+
+ This method can calculate FLOPs and parameter counts of a model with
+ corresponding input shape. It can also print complexity information for
+ each layer in a model.
+
+ Supported layers are listed as below:
+ - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``.
+ - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``,
+ ``nn.ReLU6``.
+ - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``,
+ ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``,
+ ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``,
+ ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``,
+ ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``.
+ - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``,
+ ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``,
+ ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``.
+ - Linear: ``nn.Linear``.
+ - Deconvolution: ``nn.ConvTranspose2d``.
+ - Upsample: ``nn.Upsample``.
+
+ Args:
+        model (nn.Module): The model for complexity calculation.
+        data (dict): A data batch from the dataloader; its 'img' and
+            'img_metas' fields are used to build the actual model inputs.
+        input_shape (tuple): Input shape used for calculation.
+ print_per_layer_stat (bool): Whether to print complexity information
+ for each layer in a model. Default: True.
+ as_strings (bool): Output FLOPs and params counts in a string form.
+ Default: True.
+        input_constructor (None | callable): If specified, it takes a callable
+            method that generates the input. Otherwise, a random tensor with
+            the given input shape is used to calculate FLOPs. Default: None.
+ flush (bool): same as that in :func:`print`. Default: False.
+ ost (stream): same as ``file`` param in :func:`print`.
+ Default: sys.stdout.
+
+ Returns:
+        tuple[float | str]: If ``as_strings`` is set to True, it will return
+            FLOPs and parameter counts in a string format. Otherwise, it will
+            return them as floats.
+ """
+
+ assert isinstance(model, nn.Module)
+ flops_model = add_flops_counting_methods(model)
+ flops_model.eval()
+ flops_model.start_flops_count()
+ if input_constructor:
+ input = input_constructor(input_shape)
+ _ = flops_model(**input)
+ else:
+ try:
+ batch = torch.ones(()).new_empty(
+ (1, 6, 3, *input_shape),
+ dtype=next(flops_model.parameters()).dtype,
+ device=next(flops_model.parameters()).device)
+ except StopIteration:
+ # Avoid StopIteration for models which have no parameters,
+ # like `nn.Relu()`, `nn.AvgPool2d`, etc.
+ batch = torch.ones(()).new_empty((1, 6, 3, *input_shape))
+
+ # img_metas = [data['img_metas'][0].data[0]]
+ # img = data['img'][0].data[0]
+ # points = data['points'][0].data[0][0]
+ # fut_valid_flag = data['fut_valid_flag'][0].data[0]
+ # img = img.to(batch.device)
+ # points = [points.to(batch.device)]
+ # ego_his_trajs = data['ego_his_trajs'][0].data[0].to(batch.device)
+ # ego_lcf_feat = data['ego_lcf_feat'][0].data[0].to(batch.device).unsqueeze(0)
+
+ # _ = flops_model(rescale=True, img=img, img_metas=img_metas, points=points,
+ # fut_valid_flag=fut_valid_flag, ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat)
+
+ img_metas = [data['img_metas'][0].data[0]]
+ img = data['img'][0].data[0]
+ img = img.to(batch.device)
+
+ _ = flops_model(rescale=True, img=img, img_metas=img_metas)
+
+ flops_count, params_count = flops_model.compute_average_flops_cost()
+ if print_per_layer_stat:
+ print_model_with_flops(
+ flops_model, flops_count, params_count, ost=ost, flush=flush)
+ flops_model.stop_flops_count()
+
+ if as_strings:
+ return flops_to_string(flops_count), params_to_string(params_count)
+
+ return flops_count, params_count
+
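+# Minimal usage sketch (illustrative; `model` and `data` stand for a built
+# model and a single dataloader batch):
+#   flops, params = get_model_complexity_info(model, data, as_strings=True)
+#   print(f'FLOPs: {flops}, Params: {params}')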
+
+def flops_to_string(flops, units='GFLOPs', precision=2):
+ """Convert FLOPs number into a string.
+
+    Note that here we count one multiply-add as one FLOP.
+
+ Args:
+ flops (float): FLOPs number to be converted.
+ units (str | None): Converted FLOPs units. Options are None, 'GFLOPs',
+ 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically
+ choose the most suitable unit for FLOPs. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted FLOPs number with units.
+
+ Examples:
+ >>> flops_to_string(1e9)
+ '1.0 GFLOPs'
+ >>> flops_to_string(2e5, 'MFLOPs')
+ '0.2 MFLOPs'
+ >>> flops_to_string(3e-9, None)
+ '3e-09 FLOPs'
+ """
+ if units is None:
+ if flops // 10**9 > 0:
+ return str(round(flops / 10.**9, precision)) + ' GFLOPs'
+ elif flops // 10**6 > 0:
+ return str(round(flops / 10.**6, precision)) + ' MFLOPs'
+ elif flops // 10**3 > 0:
+ return str(round(flops / 10.**3, precision)) + ' KFLOPs'
+ else:
+ return str(flops) + ' FLOPs'
+ else:
+ if units == 'GFLOPs':
+ return str(round(flops / 10.**9, precision)) + ' ' + units
+ elif units == 'MFLOPs':
+ return str(round(flops / 10.**6, precision)) + ' ' + units
+ elif units == 'KFLOPs':
+ return str(round(flops / 10.**3, precision)) + ' ' + units
+ else:
+ return str(flops) + ' FLOPs'
+
+
+def params_to_string(num_params, units=None, precision=2):
+ """Convert parameter number into a string.
+
+ Args:
+ num_params (float): Parameter number to be converted.
+ units (str | None): Converted FLOPs units. Options are None, 'M',
+ 'K' and ''. If set to None, it will automatically choose the most
+ suitable unit for Parameter number. Default: None.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted parameter number with units.
+
+ Examples:
+ >>> params_to_string(1e9)
+ '1000.0 M'
+ >>> params_to_string(2e5)
+ '200.0 k'
+ >>> params_to_string(3e-9)
+ '3e-09'
+ """
+ if units is None:
+ if num_params // 10**6 > 0:
+ return str(round(num_params / 10**6, precision)) + ' M'
+ elif num_params // 10**3:
+ return str(round(num_params / 10**3, precision)) + ' k'
+ else:
+ return str(num_params)
+ else:
+ if units == 'M':
+ return str(round(num_params / 10.**6, precision)) + ' ' + units
+ elif units == 'K':
+ return str(round(num_params / 10.**3, precision)) + ' ' + units
+ else:
+ return str(num_params)
+
+
+def print_model_with_flops(model,
+ total_flops,
+ total_params,
+ units='GFLOPs',
+ precision=3,
+ ost=sys.stdout,
+ flush=False):
+ """Print a model with FLOPs for each layer.
+
+ Args:
+ model (nn.Module): The model to be printed.
+ total_flops (float): Total FLOPs of the model.
+ total_params (float): Total parameter counts of the model.
+ units (str | None): Converted FLOPs units. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 3.
+ ost (stream): same as `file` param in :func:`print`.
+ Default: sys.stdout.
+ flush (bool): same as that in :func:`print`. Default: False.
+
+ Example:
+ >>> class ExampleModel(nn.Module):
+
+ >>> def __init__(self):
+ >>> super().__init__()
+ >>> self.conv1 = nn.Conv2d(3, 8, 3)
+ >>> self.conv2 = nn.Conv2d(8, 256, 3)
+ >>> self.conv3 = nn.Conv2d(256, 8, 3)
+ >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+ >>> self.flatten = nn.Flatten()
+ >>> self.fc = nn.Linear(8, 1)
+
+ >>> def forward(self, x):
+ >>> x = self.conv1(x)
+ >>> x = self.conv2(x)
+ >>> x = self.conv3(x)
+ >>> x = self.avg_pool(x)
+ >>> x = self.flatten(x)
+ >>> x = self.fc(x)
+ >>> return x
+
+ >>> model = ExampleModel()
+ >>> x = (3, 16, 16)
+ to print the complexity information state for each layer, you can use
+ >>> get_model_complexity_info(model, x)
+ or directly use
+ >>> print_model_with_flops(model, 4579784.0, 37361)
+ ExampleModel(
+ 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,
+ (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501
+ (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))
+ (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))
+ (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))
+ (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )
+ (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)
+ )
+ """
+
+ def accumulate_params(self):
+ if is_supported_instance(self):
+ return self.__params__
+ else:
+ sum = 0
+ for m in self.children():
+ sum += m.accumulate_params()
+ return sum
+
+ def accumulate_flops(self):
+ if is_supported_instance(self):
+ return self.__flops__ / model.__batch_counter__
+ else:
+ sum = 0
+ for m in self.children():
+ sum += m.accumulate_flops()
+ return sum
+
+ def flops_repr(self):
+ accumulated_num_params = self.accumulate_params()
+ accumulated_flops_cost = self.accumulate_flops()
+ return ', '.join([
+ params_to_string(
+ accumulated_num_params, units='M', precision=precision),
+ '{:.3%} Params'.format(accumulated_num_params / total_params),
+ flops_to_string(
+ accumulated_flops_cost, units=units, precision=precision),
+ '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops),
+ self.original_extra_repr()
+ ])
+
+ def add_extra_repr(m):
+ m.accumulate_flops = accumulate_flops.__get__(m)
+ m.accumulate_params = accumulate_params.__get__(m)
+ flops_extra_repr = flops_repr.__get__(m)
+ if m.extra_repr != flops_extra_repr:
+ m.original_extra_repr = m.extra_repr
+ m.extra_repr = flops_extra_repr
+ assert m.extra_repr != m.original_extra_repr
+
+ def del_extra_repr(m):
+ if hasattr(m, 'original_extra_repr'):
+ m.extra_repr = m.original_extra_repr
+ del m.original_extra_repr
+ if hasattr(m, 'accumulate_flops'):
+ del m.accumulate_flops
+
+ model.apply(add_extra_repr)
+ print(model, file=ost, flush=flush)
+ model.apply(del_extra_repr)
+
+
+def get_model_parameters_number(model):
+ """Calculate parameter number of a model.
+
+ Args:
+ model (nn.module): The model for parameter number calculation.
+
+ Returns:
+ float: Parameter number of the model.
+ """
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return num_params
+
+
+def add_flops_counting_methods(net_main_module):
+ # adding additional methods to the existing module object,
+ # this is done this way so that each function has access to self object
+ net_main_module.start_flops_count = start_flops_count.__get__(
+ net_main_module)
+ net_main_module.stop_flops_count = stop_flops_count.__get__(
+ net_main_module)
+ net_main_module.reset_flops_count = reset_flops_count.__get__(
+ net_main_module)
+ net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501
+ net_main_module)
+
+ net_main_module.reset_flops_count()
+
+ return net_main_module
+
+
+def compute_average_flops_cost(self):
+ """Compute average FLOPs cost.
+
+ A method to compute average FLOPs cost, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+
+ Returns:
+ float: Current mean flops consumption per image.
+ """
+ batches_count = self.__batch_counter__
+ flops_sum = 0
+ for module in self.modules():
+ if is_supported_instance(module):
+ flops_sum += module.__flops__
+ params_sum = get_model_parameters_number(self)
+ return flops_sum / batches_count, params_sum
+
+
+def start_flops_count(self):
+ """Activate the computation of mean flops consumption per image.
+
+    A method to activate the computation of mean flops consumption per image,
+    which will be available after ``add_flops_counting_methods()`` is called on
+ a desired net object. It should be called before running the network.
+ """
+ add_batch_counter_hook_function(self)
+
+ def add_flops_counter_hook_function(module):
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ return
+
+ else:
+ handle = module.register_forward_hook(
+ get_modules_mapping()[type(module)])
+
+ module.__flops_handle__ = handle
+
+ self.apply(partial(add_flops_counter_hook_function))
+
+
+def stop_flops_count(self):
+ """Stop computing the mean flops consumption per image.
+
+ A method to stop computing the mean flops consumption per image, which will
+ be available after ``add_flops_counting_methods()`` is called on a desired
+    net object. It can be called to pause the computation at any time.
+ """
+ remove_batch_counter_hook_function(self)
+ self.apply(remove_flops_counter_hook_function)
+
+
+def reset_flops_count(self):
+ """Reset statistics computed so far.
+
+    A method to reset computed statistics, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+ """
+ add_batch_counter_variables_or_reset(self)
+ self.apply(add_flops_counter_variable_or_reset)
+
+
+# ---- Internal functions
+def empty_flops_counter_hook(module, input, output):
+ module.__flops__ += 0
+
+
+def upsample_flops_counter_hook(module, input, output):
+ output_size = output[0]
+ batch_size = output_size.shape[0]
+ output_elements_count = batch_size
+ for val in output_size.shape[1:]:
+ output_elements_count *= val
+ module.__flops__ += int(output_elements_count)
+
+
+def relu_flops_counter_hook(module, input, output):
+ active_elements_count = output.numel()
+ module.__flops__ += int(active_elements_count)
+
+
+def linear_flops_counter_hook(module, input, output):
+ input = input[0]
+ output_last_dim = output.shape[
+ -1] # pytorch checks dimensions, so here we don't care much
+ module.__flops__ += int(np.prod(input.shape) * output_last_dim)
+
+
+def pool_flops_counter_hook(module, input, output):
+ input = input[0]
+ module.__flops__ += int(np.prod(input.shape))
+
+
+def norm_flops_counter_hook(module, input, output):
+ input = input[0]
+
+ batch_flops = np.prod(input.shape)
+ if (getattr(module, 'affine', False)
+ or getattr(module, 'elementwise_affine', False)):
+ batch_flops *= 2
+ module.__flops__ += int(batch_flops)
+
+
+def deconv_flops_counter_hook(conv_module, input, output):
+ # Can have multiple inputs, getting the first one
+ input = input[0]
+
+ batch_size = input.shape[0]
+ input_height, input_width = input.shape[2:]
+
+ kernel_height, kernel_width = conv_module.kernel_size
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = (
+ kernel_height * kernel_width * in_channels * filters_per_channel)
+
+ active_elements_count = batch_size * input_height * input_width
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+ bias_flops = 0
+ if conv_module.bias is not None:
+ output_height, output_width = output.shape[2:]
+        bias_flops = out_channels * batch_size * output_height * output_width
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
+
+
+def conv_flops_counter_hook(conv_module, input, output):
+ # Can have multiple inputs, getting the first one
+ input = input[0]
+
+ batch_size = input.shape[0]
+ output_dims = list(output.shape[2:])
+
+ kernel_dims = list(conv_module.kernel_size)
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = int(
+ np.prod(kernel_dims)) * in_channels * filters_per_channel
+
+ active_elements_count = batch_size * int(np.prod(output_dims))
+
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+
+ bias_flops = 0
+
+ if conv_module.bias is not None:
+
+ bias_flops = out_channels * active_elements_count
+
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
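+    # Worked example: a 3x3 conv with 3 input and 8 output channels (groups=1)
+    # on a 1x8x14x14 output costs 3*3*3*8 = 216 FLOPs per output position,
+    # times 1*14*14 = 196 positions -> 42336 conv FLOPs, plus 8*196 = 1568
+    # bias FLOPs if bias is enabled (43904 in total).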
+
+
+def batch_counter_hook(module, input, output):
+ batch_size = 1
+ if len(input) > 0:
+ # Can have multiple inputs, getting the first one
+ input = input[0]
+ batch_size = len(input)
+ else:
+ print('Warning! No positional inputs found for a module, '
+ 'assuming batch size is 1.')
+ module.__batch_counter__ += batch_size
+
+
+def add_batch_counter_variables_or_reset(module):
+
+ module.__batch_counter__ = 0
+
+
+def add_batch_counter_hook_function(module):
+ if hasattr(module, '__batch_counter_handle__'):
+ return
+
+ handle = module.register_forward_hook(batch_counter_hook)
+ module.__batch_counter_handle__ = handle
+
+
+def remove_batch_counter_hook_function(module):
+ if hasattr(module, '__batch_counter_handle__'):
+ module.__batch_counter_handle__.remove()
+ del module.__batch_counter_handle__
+
+
+def add_flops_counter_variable_or_reset(module):
+ if is_supported_instance(module):
+ if hasattr(module, '__flops__') or hasattr(module, '__params__'):
+ print('Warning: variables __flops__ or __params__ are already '
+                  'defined for the module ' + type(module).__name__ +
+                  '. ptflops can affect your code!')
+ module.__flops__ = 0
+ module.__params__ = get_model_parameters_number(module)
+
+
+def is_supported_instance(module):
+ if type(module) in get_modules_mapping():
+ return True
+ return False
+
+
+def remove_flops_counter_hook_function(module):
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ module.__flops_handle__.remove()
+ del module.__flops_handle__
+
+
+def get_modules_mapping():
+ return {
+ # convolutions
+ nn.Conv1d: conv_flops_counter_hook,
+ nn.Conv2d: conv_flops_counter_hook,
+ mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook,
+ nn.Conv3d: conv_flops_counter_hook,
+ mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook,
+ # activations
+ nn.ReLU: relu_flops_counter_hook,
+ nn.PReLU: relu_flops_counter_hook,
+ nn.ELU: relu_flops_counter_hook,
+ nn.LeakyReLU: relu_flops_counter_hook,
+ nn.ReLU6: relu_flops_counter_hook,
+ # poolings
+ nn.MaxPool1d: pool_flops_counter_hook,
+ nn.AvgPool1d: pool_flops_counter_hook,
+ nn.AvgPool2d: pool_flops_counter_hook,
+ nn.MaxPool2d: pool_flops_counter_hook,
+ mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook,
+ nn.MaxPool3d: pool_flops_counter_hook,
+ mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook,
+ nn.AvgPool3d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool1d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool1d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool2d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool2d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool3d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool3d: pool_flops_counter_hook,
+ # normalizations
+ nn.BatchNorm1d: norm_flops_counter_hook,
+ nn.BatchNorm2d: norm_flops_counter_hook,
+ nn.BatchNorm3d: norm_flops_counter_hook,
+ nn.GroupNorm: norm_flops_counter_hook,
+ nn.InstanceNorm1d: norm_flops_counter_hook,
+ nn.InstanceNorm2d: norm_flops_counter_hook,
+ nn.InstanceNorm3d: norm_flops_counter_hook,
+ nn.LayerNorm: norm_flops_counter_hook,
+ # FC
+ nn.Linear: linear_flops_counter_hook,
+ mmcv.cnn.bricks.Linear: linear_flops_counter_hook,
+ # Upscale
+ nn.Upsample: upsample_flops_counter_hook,
+ # Deconvolution
+ nn.ConvTranspose2d: deconv_flops_counter_hook,
+ mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook,
+ }
+
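+# Modules whose type is not listed above are simply not hooked, so they contribute
+# no FLOPs of their own. As an illustrative (not original) extension, an activation
+# such as nn.GELU could be counted like ReLU by adding
+# `nn.GELU: relu_flops_counter_hook` to the dict returned here.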
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Train a detector')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument(
+ '--shape',
+ type=int,
+ nargs='+',
+ default=[40000, 4],
+ help='input point cloud size')
+ parser.add_argument(
+ '--modality',
+ type=str,
+ default='point',
+ choices=['point', 'image', 'multi'],
+ help='input data modality')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ args = parser.parse_args()
+ return args
+
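+# Example invocation (file and config names are illustrative placeholders):
+#   python get_flops.py path/to/config.py --modality image --shape 1600 900
+# With two values, --shape becomes an input of (3, shape[0], shape[1]); a single
+# value gives a square image input, and 'point' expects (num_points, num_feats).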
+
+def main():
+
+ args = parse_args()
+
+ if args.modality == 'point':
+ assert len(args.shape) == 2, 'invalid input shape'
+ input_shape = tuple(args.shape)
+ elif args.modality == 'image':
+ if len(args.shape) == 1:
+ input_shape = (3, args.shape[0], args.shape[0])
+ elif len(args.shape) == 2:
+ input_shape = (3, ) + tuple(args.shape)
+ else:
+ raise ValueError('invalid input shape')
+ elif args.modality == 'multi':
+ raise NotImplementedError(
+ 'FLOPs counter is currently not supported for models with '
+ 'multi-modality input')
+
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+
+ if hasattr(cfg, 'plugin'):
+ if cfg.plugin:
+ import importlib
+ if hasattr(cfg, 'plugin_dir'):
+ plugin_dir = cfg.plugin_dir
+ _module_dir = os.path.dirname(plugin_dir)
+ _module_dir = _module_dir.split('/')
+ _module_path = _module_dir[0]
+
+ for m in _module_dir[1:]:
+ _module_path = _module_path + '.' + m
+ print(_module_path)
+ plg_lib = importlib.import_module(_module_path)
+ else:
+ # import dir is the dirpath for the config file
+ _module_dir = os.path.dirname(args.config)
+ _module_dir = _module_dir.split('/')
+ _module_path = _module_dir[0]
+ for m in _module_dir[1:]:
+ _module_path = _module_path + '.' + m
+ print(_module_path)
+ plg_lib = importlib.import_module(_module_path)
+
+ samples_per_gpu = 1
+ from mmdet.datasets import replace_ImageToTensor
+ if isinstance(cfg.data.test, dict):
+ cfg.data.test.test_mode = True
+ samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+ if samples_per_gpu > 1:
+            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
+ cfg.data.test.pipeline = replace_ImageToTensor(
+ cfg.data.test.pipeline)
+ elif isinstance(cfg.data.test, list):
+ for ds_cfg in cfg.data.test:
+ ds_cfg.test_mode = True
+ samples_per_gpu = max(
+ [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+ if samples_per_gpu > 1:
+ for ds_cfg in cfg.data.test:
+ ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+ dataset = build_dataset(cfg.data.test)
+ dataset.is_vis_on_test = True #TODO, this is a hack
+ data_loader = build_dataloader(
+ dataset,
+ samples_per_gpu=1,
+ workers_per_gpu=0,
+ dist=False,
+ shuffle=False,
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+ )
+ for i, data in enumerate(data_loader):
+ # if ~(data['map_gt_labels_3d'].data[0][0] != -1).any():
+ # continue
+ img = data['img'][0].data[0]
+ img_metas = data['img_metas'][0].data[0]
+ break
+
+ model = build_model(
+ cfg.model,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+ if torch.cuda.is_available():
+ model.cuda()
+ model.eval()
+
+ if hasattr(model, 'forward_dummy'):
+ model.forward = model.forward_dummy
+ else:
+ raise NotImplementedError(
+ 'FLOPs counter is currently not supported for {}'.format(
+ model.__class__.__name__))
+
+ flops, params = get_model_complexity_info(model, data)
+ split_line = '=' * 30
+ print(f'{split_line}\nInput shape: {input_shape}\n'
+ f'Flops: {flops}\nParams: {params}\n{split_line}')
+ print('!!!Please be cautious if you use the results in papers. '
+ 'You may need to check if all ops are supported and verify that the '
+ 'flops computation is correct.')
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/adzoo/vad/analysis_tools/get_params.py b/adzoo/vad/analysis_tools/get_params.py
new file mode 100644
index 0000000..6bf4ecf
--- /dev/null
+++ b/adzoo/vad/analysis_tools/get_params.py
@@ -0,0 +1,8 @@
+import torch
+
+# Count the total number of parameters stored in a checkpoint's state_dict.
+# Set YOUR_CKPT_PATH to the checkpoint file before running this script.
+YOUR_CKPT_PATH = None
+file_path = YOUR_CKPT_PATH
+model = torch.load(file_path, map_location='cpu')
+total = 0
+for key in model['state_dict'].keys():
+    total += model['state_dict'][key].nelement()
+print(total)
diff --git a/adzoo/vad/analysis_tools/visualization.py b/adzoo/vad/analysis_tools/visualization.py
new file mode 100644
index 0000000..7fb9776
--- /dev/null
+++ b/adzoo/vad/analysis_tools/visualization.py
@@ -0,0 +1,911 @@
+import sys
+sys.path.append('')
+import os
+import argparse
+import os.path as osp
+from PIL import Image
+from tqdm import tqdm
+from typing import List, Dict
+
+import cv2
+import mmcv
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+from pyquaternion import Quaternion
+from nuscenes.nuscenes import NuScenes
+from mmdet.datasets.pipelines import to_tensor
+from matplotlib.collections import LineCollection
+from nuscenes.utils.data_classes import LidarPointCloud, Box
+from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility
+
+from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox, CustomDetectionBox, color_map
+from projects.mmdet3d_plugin.datasets.nuscenes_vad_dataset import VectorizedLocalMap, LiDARInstanceLines
+
+
+cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+
+def render_annotation(
+ anntoken: str,
+ margin: float = 10,
+ view: np.ndarray = np.eye(4),
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ out_path: str = 'render.png',
+ extra_info: bool = False) -> None:
+ """
+ Render selected annotation.
+ :param anntoken: Sample_annotation token.
+ :param margin: How many meters in each direction to include in LIDAR view.
+ :param view: LIDAR view point.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param out_path: Optional path to save the rendered figure to disk.
+ :param extra_info: Whether to render extra information below camera view.
+ """
+ ann_record = nusc.get('sample_annotation', anntoken)
+ sample_record = nusc.get('sample', ann_record['sample_token'])
+ assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.'
+
+ # Figure out which camera the object is fully visible in (this may return nothing).
+ boxes, cam = [], []
+ cams = [key for key in sample_record['data'].keys() if 'CAM' in key]
+ all_bboxes = []
+ select_cams = []
+ for cam in cams:
+ _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level,
+ selected_anntokens=[anntoken])
+ if len(boxes) > 0:
+ all_bboxes.append(boxes)
+ select_cams.append(cam)
+            # Keep every camera in which the annotation is visible.
+ # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \
+ # 'Try using e.g. BoxVisibility.ANY.'
+ # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!'
+
+ num_cam = len(all_bboxes)
+
+ fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9))
+ select_cams = [sample_record['data'][cam] for cam in select_cams]
+ print('bbox in cams:', select_cams)
+ # Plot LIDAR view.
+ lidar = sample_record['data']['LIDAR_TOP']
+ data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken])
+ LidarPointCloud.from_file(data_path).render_height(axes[0], view=view)
+ for box in boxes:
+ c = np.array(get_color(box.name)) / 255.0
+ box.render(axes[0], view=view, colors=(c, c, c))
+ corners = view_points(boxes[0].corners(), view, False)[:2, :]
+ axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin])
+ axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin])
+ axes[0].axis('off')
+ axes[0].set_aspect('equal')
+
+ # Plot CAMERA view.
+ for i in range(1, num_cam + 1):
+ cam = select_cams[i - 1]
+ data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken])
+ im = Image.open(data_path)
+ axes[i].imshow(im)
+ axes[i].set_title(nusc.get('sample_data', cam)['channel'])
+ axes[i].axis('off')
+ axes[i].set_aspect('equal')
+ for box in boxes:
+ c = np.array(get_color(box.name)) / 255.0
+ box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+ # Print extra information about the annotation below the camera view.
+ axes[i].set_xlim(0, im.size[0])
+ axes[i].set_ylim(im.size[1], 0)
+
+ if extra_info:
+ rcParams['font.family'] = 'monospace'
+
+ w, l, h = ann_record['size']
+ category = ann_record['category_name']
+ lidar_points = ann_record['num_lidar_pts']
+ radar_points = ann_record['num_radar_pts']
+
+ sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP'])
+ pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token'])
+ dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation']))
+
+ information = ' \n'.join(['category: {}'.format(category),
+ '',
+ '# lidar points: {0:>4}'.format(lidar_points),
+ '# radar points: {0:>4}'.format(radar_points),
+ '',
+ 'distance: {:>7.3f}m'.format(dist),
+ '',
+ 'width: {:>7.3f}m'.format(w),
+ 'length: {:>7.3f}m'.format(l),
+ 'height: {:>7.3f}m'.format(h)])
+
+ plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top')
+
+ if out_path is not None:
+ plt.savefig(out_path)
+
+
+def get_sample_data(sample_data_token: str,
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ selected_anntokens=None,
+ use_flat_vehicle_coordinates: bool = False):
+ """
+ Returns the data path as well as all annotations related to that sample_data.
+ Note that the boxes are transformed into the current sensor's coordinate frame.
+ :param sample_data_token: Sample_data token.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param selected_anntokens: If provided only return the selected annotation.
+ :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+ aligned to z-plane in the world.
+ :return: (data_path, boxes, camera_intrinsic )
+ """
+
+ # Retrieve sensor & pose records
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+ data_path = nusc.get_sample_data_path(sample_data_token)
+
+ if sensor_record['modality'] == 'camera':
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ else:
+ cam_intrinsic = None
+ imsize = None
+
+ # Retrieve all sample annotations and map to sensor coordinate system.
+ if selected_anntokens is not None:
+ boxes = list(map(nusc.get_box, selected_anntokens))
+ else:
+ boxes = nusc.get_boxes(sample_data_token)
+
+ # Make list of Box objects including coord system transforms.
+ box_list = []
+ for box in boxes:
+ if use_flat_vehicle_coordinates:
+ # Move box to ego vehicle coord system parallel to world z plane.
+ yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+ else:
+ # Move box to ego vehicle coord system.
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if sensor_record['modality'] == 'camera' and not \
+ box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+ continue
+
+ box_list.append(box)
+
+ return data_path, box_list, cam_intrinsic
+
+
+def get_predicted_data(sample_data_token: str,
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ selected_anntokens=None,
+ use_flat_vehicle_coordinates: bool = False,
+ pred_anns=None
+ ):
+ """
+ Returns the data path as well as all annotations related to that sample_data.
+ Note that the boxes are transformed into the current sensor's coordinate frame.
+ :param sample_data_token: Sample_data token.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param selected_anntokens: If provided only return the selected annotation.
+ :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+ aligned to z-plane in the world.
+    :param pred_anns: Externally supplied predicted boxes to transform in place of database annotations.
+    :return: (data_path, boxes, camera_intrinsic)
+ """
+
+ # Retrieve sensor & pose records
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+ data_path = nusc.get_sample_data_path(sample_data_token)
+
+ if sensor_record['modality'] == 'camera':
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ else:
+ cam_intrinsic = None
+ imsize = None
+
+ # Retrieve all sample annotations and map to sensor coordinate system.
+ # if selected_anntokens is not None:
+ # boxes = list(map(nusc.get_box, selected_anntokens))
+ # else:
+ # boxes = nusc.get_boxes(sample_data_token)
+ boxes = pred_anns
+ # Make list of Box objects including coord system transforms.
+ box_list = []
+ for box in boxes:
+ if use_flat_vehicle_coordinates:
+ # Move box to ego vehicle coord system parallel to world z plane.
+ yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+ else:
+ # Move box to ego vehicle coord system.
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if sensor_record['modality'] == 'camera' and not \
+ box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+ continue
+ box_list.append(box)
+
+ return data_path, box_list, cam_intrinsic
+
+
+def lidiar_render(sample_token, data, out_path=None, out_name=None, traj_use_perstep_offset=True):
+ bbox_gt_list = []
+ bbox_pred_list = []
+ sample_rec = nusc.get('sample', sample_token)
+ anns = sample_rec['anns']
+ sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+ for ann in anns:
+ content = nusc.get('sample_annotation', ann)
+ gt_fut_trajs, gt_fut_masks = get_gt_fut_trajs(
+ nusc=nusc, anno=content, cs_record=cs_record,
+ pose_record=pose_record, fut_ts=6
+ )
+ try:
+ bbox_gt_list.append(CustomDetectionBox(
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=nusc.box_velocity(content['token'])[:2],
+ fut_trajs=tuple(gt_fut_trajs),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=category_to_detection_name(content['category_name']),
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=''))
+ except:
+ pass
+
+ bbox_anns = data['results'][sample_token]
+ for content in bbox_anns:
+ bbox_pred_list.append(CustomDetectionBox(
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ fut_trajs=tuple(content['fut_traj']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name']))
+ gt_annotations = EvalBoxes()
+ pred_annotations = EvalBoxes()
+ gt_annotations.add_boxes(sample_token, bbox_gt_list)
+ pred_annotations.add_boxes(sample_token, bbox_pred_list)
+    # print('green is ground truth')
+    # print('blue is the predicted result')
+ visualize_sample(nusc, sample_token, gt_annotations, pred_annotations,
+ savepath=out_path, traj_use_perstep_offset=traj_use_perstep_offset, pred_data=data)
+
+
+def get_color(category_name: str):
+ """
+ Provides the default colors based on the category names.
+ This method works for the general nuScenes categories, as well as the nuScenes detection categories.
+ """
+ a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+ 'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+ 'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+ 'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+ 'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+ 'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+ 'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+ 'vehicle.ego']
+ class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+ ]
+ #print(category_name)
+ if category_name == 'bicycle':
+ return nusc.colormap['vehicle.bicycle']
+ elif category_name == 'construction_vehicle':
+ return nusc.colormap['vehicle.construction']
+ elif category_name == 'traffic_cone':
+ return nusc.colormap['movable_object.trafficcone']
+
+ for key in nusc.colormap.keys():
+ if category_name in key:
+ return nusc.colormap[key]
+ return [0, 0, 0]
+
+# TODO: whether to rotate traj
+def boxes_to_sensor(boxes: List[EvalBox], pose_record: Dict, cs_record: Dict):
+ """
+ Map boxes from global coordinates to the vehicle's sensor coordinate system.
+ :param boxes: The boxes in global coordinates.
+ :param pose_record: The pose record of the vehicle at the current timestamp.
+ :param cs_record: The calibrated sensor record of the sensor.
+ :return: The transformed boxes.
+ """
+ boxes_out = []
+ for box in boxes:
+ # Create Box instance.
+ box = CustomNuscenesBox(
+ box.translation, box.size, Quaternion(box.rotation), box.fut_trajs, name=box.detection_name
+ )
+ # Move box to ego vehicle coord system.
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(pose_record['rotation']).inverse)
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ boxes_out.append(box)
+
+ return boxes_out
+
+
+def get_gt_fut_trajs(nusc: NuScenes,
+ anno,
+ cs_record,
+ pose_record,
+                     fut_ts):
+    """
+    Compute the ground-truth future trajectory of one annotation as per-step xy
+    offsets in the lidar frame, together with a per-step validity mask.
+    :param nusc: NuScenes object.
+    :return: (gt_fut_trajs, gt_fut_masks) as flat lists of length fut_ts*2 and fut_ts.
+    """
+ box = Box(anno['translation'], anno['size'], Quaternion(anno['rotation']))
+ # Move box to ego vehicle coord system.
+ box.translate(-np.array(pose_record['translation']))
+ box.rotate(Quaternion(pose_record['rotation']).inverse)
+ # Move box to sensor coord system.
+ box.translate(-np.array(cs_record['translation']))
+ box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ # get future trajectory coords for each box
+ gt_fut_trajs = np.zeros((fut_ts, 2)) # [fut_ts*2]
+ gt_fut_masks = np.zeros((fut_ts)) # [fut_ts]
+ gt_fut_trajs[:] = box.center[:2]
+ cur_box = box
+ cur_anno = anno
+ for i in range(fut_ts):
+ if cur_anno['next'] != '':
+ anno_next = nusc.get('sample_annotation', cur_anno['next'])
+ box_next = Box(
+ anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+ )
+ # Move box to ego vehicle coord system.
+ box_next.translate(-np.array(pose_record['translation']))
+ box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+ # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation']))
+ box_next.rotate(Quaternion(cs_record['rotation']).inverse)
+ # gt_fut_trajs[i] = box_next.center[:2]
+ gt_fut_trajs[i] = box_next.center[:2] - cur_box.center[:2]
+ gt_fut_masks[i] = 1
+ cur_anno = anno_next
+ cur_box = box_next
+ else:
+ # gt_fut_trajs[i:] = gt_fut_trajs[i-1]
+ gt_fut_trajs[i:] = 0
+ break
+
+ return gt_fut_trajs.reshape(-1).tolist(), gt_fut_masks.reshape(-1).tolist()
+
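+# Tiny worked example for get_gt_fut_trajs (values are illustrative): if a box sits at
+# (1.0, 2.0) in the lidar frame and its next two annotations land at (1.5, 2.5) and
+# (2.0, 3.0) before the track ends, the returned offsets are
+# [0.5, 0.5, 0.5, 0.5, 0.0, ...] (flattened xy deltas, zero-padded to fut_ts steps)
+# with masks [1, 1, 0, 0, 0, 0] for fut_ts=6.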
+def get_gt_vec_maps(
+ sample_token,
+ data_root='data/nuscenes/',
+ pc_range=[-15.0, -30.0, -4.0, 15.0, 30.0, 4.0],
+ padding_value=-10000,
+ map_classes=['divider', 'ped_crossing', 'boundary'],
+ map_fixed_ptsnum_per_line=20
+):
+    """
+    Get the ground-truth vectorized map for a given sample.
+    :return: (gt_vecs_pts_loc, gt_vecs_label)
+    """
+ sample_rec = nusc.get('sample', sample_token)
+ sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+    lidar2ego_translation = cs_record['translation']
+    lidar2ego_rotation = cs_record['rotation']
+    ego2global_translation = pose_record['translation']
+    ego2global_rotation = pose_record['rotation']
+ map_location = nusc.get('log', nusc.get('scene', sample_rec['scene_token'])['log_token'])['location']
+
+ lidar2ego = np.eye(4)
+ lidar2ego[:3,:3] = Quaternion(cs_record['rotation']).rotation_matrix
+ lidar2ego[:3, 3] = cs_record['translation']
+ ego2global = np.eye(4)
+ ego2global[:3,:3] = Quaternion(pose_record['rotation']).rotation_matrix
+ ego2global[:3, 3] = pose_record['translation']
+ lidar2global = ego2global @ lidar2ego
+ lidar2global_translation = list(lidar2global[:3,3])
+ lidar2global_rotation = list(Quaternion(matrix=lidar2global).q)
+ patch_h = pc_range[4]-pc_range[1]
+ patch_w = pc_range[3]-pc_range[0]
+ patch_size = (patch_h, patch_w)
+
+ vector_map = VectorizedLocalMap(data_root, patch_size=patch_size,
+ map_classes=map_classes,
+ fixed_ptsnum_per_line=map_fixed_ptsnum_per_line,
+ padding_value=padding_value)
+
+
+ anns_results = vector_map.gen_vectorized_samples(
+ map_location, lidar2global_translation, lidar2global_rotation
+ )
+
+ '''
+ anns_results, type: dict
+ 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates
+ 'gt_vecs_pts_num': list[num_vecs], vec with num_points
+ 'gt_vecs_label': list[num_vecs], vec with cls index
+ '''
+ gt_vecs_label = to_tensor(anns_results['gt_vecs_label'])
+ if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines):
+ gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc']
+ else:
+ gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc'])
+ try:
+ gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32)
+ except:
+ gt_vecs_pts_loc = gt_vecs_pts_loc
+
+ return gt_vecs_pts_loc, gt_vecs_label
+
+
+def visualize_sample(nusc: NuScenes,
+ sample_token: str,
+ gt_boxes: EvalBoxes,
+ pred_boxes: EvalBoxes,
+ nsweeps: int = 1,
+ conf_th: float = 0.4,
+ pc_range: list = [-30.0, -30.0, -4.0, 30.0, 30.0, 4.0],
+ verbose: bool = True,
+ savepath: str = None,
+ traj_use_perstep_offset: bool = True,
+ data_root='data/nuscenes/',
+ map_pc_range: list = [-15.0, -30.0, -4.0, 15.0, 30.0, 4.0],
+ padding_value=-10000,
+ map_classes=['divider', 'ped_crossing', 'boundary'],
+ map_fixed_ptsnum_per_line=20,
+ gt_format=['fixed_num_pts'],
+ colors_plt = ['cornflowerblue', 'royalblue', 'slategrey'],
+ pred_data = None) -> None:
+ """
+ Visualizes a sample from BEV with annotations and detection results.
+ :param nusc: NuScenes object.
+ :param sample_token: The nuScenes sample token.
+ :param gt_boxes: Ground truth boxes grouped by sample.
+ :param pred_boxes: Prediction grouped by sample.
+ :param nsweeps: Number of sweeps used for lidar visualization.
+ :param conf_th: The confidence threshold used to filter negatives.
+    :param pc_range: Range in meters beyond which boxes are ignored.
+    :param verbose: Whether to print to stdout.
+    :param savepath: If given, saves the rendering here instead of displaying.
+ """
+ # Retrieve sensor & pose records.
+ sample_rec = nusc.get('sample', sample_token)
+ sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+ # Get boxes.
+ boxes_gt_global = gt_boxes[sample_token]
+ boxes_est_global = pred_boxes[sample_token]
+ # Map GT boxes to lidar.
+ boxes_gt = boxes_to_sensor(boxes_gt_global, pose_record, cs_record)
+ # Map EST boxes to lidar.
+ boxes_est = boxes_to_sensor(boxes_est_global, pose_record, cs_record)
+ # Add scores to EST boxes.
+ for box_est, box_est_global in zip(boxes_est, boxes_est_global):
+ box_est.score = box_est_global.detection_score
+
+ # Init axes.
+ fig, axes = plt.subplots(1, 1, figsize=(4, 4))
+ plt.xlim(xmin=-30, xmax=30)
+ plt.ylim(ymin=-30, ymax=30)
+
+ # Show Pred Map
+ result_dic = pred_data['map_results'][sample_token]['vectors']
+
+ for vector in result_dic:
+ if vector['confidence_level'] < 0.6:
+ continue
+ pred_pts_3d = vector['pts']
+ pred_label_3d = vector['type']
+ pts_x = np.array([pt[0] for pt in pred_pts_3d])
+ pts_y = np.array([pt[1] for pt in pred_pts_3d])
+
+ axes.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=1,alpha=0.8,zorder=-1)
+ axes.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1)
+
+ # ignore_list = ['barrier', 'motorcycle', 'bicycle', 'traffic_cone']
+ ignore_list = ['barrier', 'bicycle', 'traffic_cone']
+
+ # Show Pred boxes.
+ for i, box in enumerate(boxes_est):
+ if box.name in ignore_list:
+ continue
+ # Show only predictions with a high score.
+ assert not np.isnan(box.score), 'Error: Box score cannot be NaN!'
+ if box.score < conf_th or abs(box.center[0]) > 15 or abs(box.center[1]) > 30:
+ continue
+ box.render(axes, view=np.eye(4), colors=('tomato', 'tomato', 'tomato'), linewidth=1, box_idx=None)
+ # if box.name in ['pedestrian']:
+ # continue
+ if traj_use_perstep_offset:
+ mode_idx = [0, 1, 2, 3, 4, 5]
+ box.render_fut_trajs_grad_color(axes, linewidth=1, mode_idx=mode_idx, fut_ts=6, cmap='autumn')
+ else:
+ box.render_fut_trajs_coords(axes, color='tomato', linewidth=1)
+
+ # Show Planning.
+ axes.plot([-0.9, -0.9], [-2, 2], color='mediumseagreen', linewidth=1, alpha=0.8)
+ axes.plot([-0.9, 0.9], [2, 2], color='mediumseagreen', linewidth=1, alpha=0.8)
+ axes.plot([0.9, 0.9], [2, -2], color='mediumseagreen', linewidth=1, alpha=0.8)
+ axes.plot([0.9, -0.9], [-2, -2], color='mediumseagreen', linewidth=1, alpha=0.8)
+ axes.plot([0.0, 0.0], [0.0, 2], color='mediumseagreen', linewidth=1, alpha=0.8)
+ plan_cmd = np.argmax(pred_data['plan_results'][sample_token][1][0,0,0])
+ plan_traj = pred_data['plan_results'][sample_token][0][plan_cmd]
+ plan_traj[abs(plan_traj) < 0.01] = 0.0
+ plan_traj = plan_traj.cumsum(axis=0)
+ plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0)
+ plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1)
+
+ plan_vecs = None
+ for i in range(plan_traj.shape[0]):
+ plan_vec_i = plan_traj[i]
+ x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51)
+ y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51)
+ xy = np.stack((x_linspace, y_linspace), axis=1)
+ xy = np.stack((xy[:-1], xy[1:]), axis=1)
+ if plan_vecs is None:
+ plan_vecs = xy
+ else:
+ plan_vecs = np.concatenate((plan_vecs, xy), axis=0)
+
+ cmap = 'winter'
+ y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301))
+ colors = color_map(y[:-1], cmap)
+ line_segments = LineCollection(plan_vecs, colors=colors, linewidths=1, linestyles='solid', cmap=cmap)
+ axes.add_collection(line_segments)
+
+ axes.axes.xaxis.set_ticks([])
+ axes.axes.yaxis.set_ticks([])
+ axes.axis('off')
+ fig.set_tight_layout(True)
+ fig.canvas.draw()
+ plt.savefig(savepath+'/bev_pred.png', bbox_inches='tight', dpi=200)
+ plt.close()
+
+
+def obtain_sensor2top(nusc,
+ sensor_token,
+ l2e_t,
+ l2e_r_mat,
+ e2g_t,
+ e2g_r_mat,
+ sensor_type='lidar'):
+ """Obtain the info with RT matric from general sensor to Top LiDAR.
+
+ Args:
+ nusc (class): Dataset class in the nuScenes dataset.
+ sensor_token (str): Sample data token corresponding to the
+ specific sensor type.
+ l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+ l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+ in shape (3, 3).
+ e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+ e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+ in shape (3, 3).
+ sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        tuple: (sensor2lidar_rotation, sensor2lidar_translation), the rotation matrix
+            and translation that map points from the given sensor to the Top LiDAR frame.
+    """
+ sd_rec = nusc.get('sample_data', sensor_token)
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ data_path = str(nusc.get_sample_data_path(sd_rec['token']))
+ if os.getcwd() in data_path: # path from lyftdataset is absolute path
+ data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path
+ sweep = {
+ 'data_path': data_path,
+ 'type': sensor_type,
+ 'sample_data_token': sd_rec['token'],
+ 'sensor2ego_translation': cs_record['translation'],
+ 'sensor2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sd_rec['timestamp']
+ }
+
+ l2e_r_s = sweep['sensor2ego_rotation']
+ l2e_t_s = sweep['sensor2ego_translation']
+ e2g_r_s = sweep['ego2global_rotation']
+ e2g_t_s = sweep['ego2global_translation']
+
+ # obtain the RT from sensor to Top LiDAR
+ # sweep->ego->global->ego'->lidar
+ l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
+ e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
+ R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
+ ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
+ sensor2lidar_rotation = R.T # points @ R.T + T
+ sensor2lidar_translation = T
+
+ return sensor2lidar_rotation, sensor2lidar_translation
+
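+# For reference: R and T above compose the chain sensor -> ego' -> global -> ego -> lidar,
+# so a point p given in the sensor frame maps to the Top LiDAR frame (row-vector convention) as
+#   p_lidar = p_sensor @ sensor2lidar_rotation.T + sensor2lidar_translation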
+def render_sample_data(
+        sample_token: str,
+ with_anns: bool = True,
+ box_vis_level: BoxVisibility = BoxVisibility.ANY,
+ axes_limit: float = 40,
+ ax=None,
+ nsweeps: int = 1,
+ out_path: str = None,
+ out_name: str = None,
+ underlay_map: bool = True,
+ use_flat_vehicle_coordinates: bool = True,
+ show_lidarseg: bool = False,
+ show_lidarseg_legend: bool = False,
+ filter_lidarseg_labels=None,
+ lidarseg_preds_bin_path: str = None,
+ verbose: bool = True,
+ show_panoptic: bool = False,
+ pred_data=None,
+ traj_use_perstep_offset: bool = True
+ ) -> None:
+ """
+ Render sample data onto axis.
+    :param sample_token: Sample token.
+ :param with_anns: Whether to draw box annotations.
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+ :param axes_limit: Axes limit for lidar and radar (measured in meters).
+ :param ax: Axes onto which to render.
+ :param nsweeps: Number of sweeps for lidar and radar.
+ :param out_path: Optional path to save the rendered figure to disk.
+ :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow.
+ :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+ aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which
+ can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new
+ setting is more correct and rotates the plot by ~90 degrees.
+ :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set
+ to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+ :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame.
+ :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None
+ or the list is empty, all classes will be displayed.
+ :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation
+ predictions for the sample.
+ :param verbose: Whether to display the image after it is rendered.
+ :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set
+ to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+ If show_lidarseg is True, show_panoptic will be set to False.
+ """
+    lidiar_render(sample_token, pred_data, out_path=out_path,
+                  out_name=out_name, traj_use_perstep_offset=traj_use_perstep_offset)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Visualize VAD predictions')
+ parser.add_argument('--result-path', help='inference result file path')
+ parser.add_argument('--save-path', help='the dir to save visualization results')
+ args = parser.parse_args()
+
+ return args
+
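+# Example usage (paths are placeholders):
+#   python adzoo/vad/analysis_tools/visualization.py \
+#       --result-path work_dirs/vad/results.pkl --save-path viz/vad
+# The script renders a BEV prediction plus the six camera views per sample and
+# stitches them into <save-path>/vis.mp4.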
+
+if __name__ == '__main__':
+ args = parse_args()
+ inference_result_path = args.result_path
+ out_path = args.save_path
+ bevformer_results = mmcv.load(inference_result_path)
+ sample_token_list = list(bevformer_results['results'].keys())
+
+ nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True)
+
+ imgs = []
+ fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
+ video_path = osp.join(out_path, 'vis.mp4')
+ video = cv2.VideoWriter(video_path, fourcc, 10, (2933, 800), True)
+ for id in tqdm(range(len(sample_token_list))):
+ mmcv.mkdir_or_exist(out_path)
+ render_sample_data(sample_token_list[id],
+ pred_data=bevformer_results,
+ out_path=out_path)
+ pred_path = osp.join(out_path, 'bev_pred.png')
+ pred_img = cv2.imread(pred_path)
+ os.remove(pred_path)
+
+ sample_token = sample_token_list[id]
+ sample = nusc.get('sample', sample_token)
+ # sample = data['results'][sample_token_list[0]][0]
+ cams = [
+ 'CAM_FRONT_LEFT',
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_RIGHT',
+ ]
+
+ cam_imgs = []
+ for cam in cams:
+ sample_data_token = sample['data'][cam]
+ sd_record = nusc.get('sample_data', sample_data_token)
+ sensor_modality = sd_record['sensor_modality']
+ if sensor_modality in ['lidar', 'radar']:
+ assert False
+ elif sensor_modality == 'camera':
+ boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']),
+ name=record['detection_name'], token='predicted') for record in
+ bevformer_results['results'][sample_token]]
+ data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token,
+ box_vis_level=BoxVisibility.ANY,
+ pred_anns=boxes)
+ _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=BoxVisibility.ANY)
+
+ data = Image.open(data_path)
+
+ # Show image.
+ _, ax = plt.subplots(1, 1, figsize=(6, 12))
+ ax.imshow(data)
+
+ if cam == 'CAM_FRONT':
+ lidar_sd_record = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ lidar_cs_record = nusc.get('calibrated_sensor', lidar_sd_record['calibrated_sensor_token'])
+ lidar_pose_record = nusc.get('ego_pose', lidar_sd_record['ego_pose_token'])
+
+ # get plan traj [x,y,z,w] quaternion, w=1
+ # we set z=-1 to get points near the ground in lidar coord system
+ plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0])
+ plan_traj = bevformer_results['plan_results'][sample_token][0][plan_cmd]
+ plan_traj[abs(plan_traj) < 0.01] = 0.0
+ plan_traj = plan_traj.cumsum(axis=0)
+
+ plan_traj = np.concatenate((
+ plan_traj[:, [0]],
+ plan_traj[:, [1]],
+ -1.0*np.ones((plan_traj.shape[0], 1)),
+ np.ones((plan_traj.shape[0], 1)),
+ ), axis=1)
+ # add the start point in lcf
+ plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0)
+ # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2]
+ plan_traj[0, 0] = 0.3
+ plan_traj[0, 2] = -1.0
+ plan_traj[0, 3] = 1.0
+
+ l2e_r = lidar_cs_record['rotation']
+ l2e_t = lidar_cs_record['translation']
+ e2g_r = lidar_pose_record['rotation']
+ e2g_t = lidar_pose_record['translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+ s2l_r, s2l_t = obtain_sensor2top(nusc, sample_data_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam)
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(s2l_r)
+ lidar2cam_t = s2l_t @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
+ viewpad = np.eye(4)
+ viewpad[:camera_intrinsic.shape[0], :camera_intrinsic.shape[1]] = camera_intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ plan_traj = lidar2img_rt @ plan_traj.T
+ plan_traj = plan_traj[0:2, ...] / np.maximum(
+ plan_traj[2:3, ...], np.ones_like(plan_traj[2:3, ...]) * 1e-5)
+ plan_traj = plan_traj.T
+ plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1)
+
+ plan_vecs = None
+ for i in range(plan_traj.shape[0]):
+ plan_vec_i = plan_traj[i]
+ x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51)
+ y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51)
+ xy = np.stack((x_linspace, y_linspace), axis=1)
+ xy = np.stack((xy[:-1], xy[1:]), axis=1)
+ if plan_vecs is None:
+ plan_vecs = xy
+ else:
+ plan_vecs = np.concatenate((plan_vecs, xy), axis=0)
+
+ cmap = 'winter'
+ y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301))
+ colors = color_map(y[:-1], cmap)
+ line_segments = LineCollection(plan_vecs, colors=colors, linewidths=2, linestyles='solid', cmap=cmap)
+ ax.add_collection(line_segments)
+
+ ax.set_xlim(0, data.size[0])
+ ax.set_ylim(data.size[1], 0)
+ ax.axis('off')
+ if out_path is not None:
+ savepath = osp.join(out_path, f'{cam}_PRED')
+ plt.savefig(savepath, bbox_inches='tight', dpi=200, pad_inches=0.0)
+ plt.close()
+
+ # Load boxes and image.
+ data_path = osp.join(out_path, f'{cam}_PRED.png')
+ cam_img = cv2.imread(data_path)
+ lw = 6
+ tf = max(lw - 3, 1)
+ w, h = cv2.getTextSize(cam, 0, fontScale=lw / 6, thickness=tf)[0] # text width, height
+ # color=(0, 0, 0)
+ txt_color=(255, 255, 255)
+ cv2.putText(cam_img,
+ cam, (10, h + 10),
+ 0,
+ lw / 6,
+ txt_color,
+ thickness=tf,
+ lineType=cv2.LINE_AA)
+ cam_imgs.append(cam_img)
+ else:
+ raise ValueError("Error: Unknown sensor modality!")
+
+ plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0])
+ cmd_list = ['Turn Right', 'Turn Left', 'Go Straight']
+ plan_cmd_str = cmd_list[plan_cmd]
+ pred_img = cv2.copyMakeBorder(pred_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0)
+ # font
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ # fontScale
+ fontScale = 1
+ # Line thickness of 2 px
+ thickness = 3
+ # org
+ org = (20, 40)
+ # Blue color in BGR
+ color = (0, 0, 0)
+ # Using cv2.putText() method
+ pred_img = cv2.putText(pred_img, 'BEV', org, font,
+ fontScale, color, thickness, cv2.LINE_AA)
+ pred_img = cv2.putText(pred_img, plan_cmd_str, (20, 770), font,
+ fontScale, color, thickness, cv2.LINE_AA)
+
+ sample_img = pred_img
+ cam_img_top = cv2.hconcat([cam_imgs[0], cam_imgs[1], cam_imgs[2]])
+ cam_img_down = cv2.hconcat([cam_imgs[3], cam_imgs[4], cam_imgs[5]])
+ cam_img = cv2.vconcat([cam_img_top, cam_img_down])
+ size = (2133, 800)
+ cam_img = cv2.resize(cam_img, size)
+ vis_img = cv2.hconcat([cam_img, sample_img])
+
+ video.write(vis_img)
+
+ video.release()
+ cv2.destroyAllWindows()
diff --git a/adzoo/vad/apis/__init__.py b/adzoo/vad/apis/__init__.py
new file mode 100644
index 0000000..15dff22
--- /dev/null
+++ b/adzoo/vad/apis/__init__.py
@@ -0,0 +1,3 @@
+from .train import custom_train_model
+from .mmdet_train import custom_train_detector
+# from .test import custom_multi_gpu_test
\ No newline at end of file
diff --git a/adzoo/vad/apis/mmdet_train.py b/adzoo/vad/apis/mmdet_train.py
new file mode 100644
index 0000000..687b989
--- /dev/null
+++ b/adzoo/vad/apis/mmdet_train.py
@@ -0,0 +1,196 @@
+import random
+import warnings
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch.nn import DataParallel
+from torch.nn.parallel.distributed import DistributedDataParallel
+from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
+ Fp16OptimizerHook, OptimizerHook,
+ build_runner)
+from mmcv.utils import build_from_cfg
+from mmcv.optims import build_optimizer
+from mmcv.core import EvalHook
+
+from mmcv.datasets import (build_dataset,
+ replace_ImageToTensor)
+from mmcv.utils import get_root_logger
+import time
+import os.path as osp
+from mmcv.datasets.builder import build_dataloader
+from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook
+from mmcv.datasets.builder import custom_build_dataset
+def custom_train_detector(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ eval_model=None,
+ meta=None):
+ logger = get_root_logger(cfg.log_level)
+
+ # prepare data loaders
+
+ dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    # assert len(dataset) == 1
+ if 'imgs_per_gpu' in cfg.data:
+ logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+ 'Please use "samples_per_gpu" instead')
+ if 'samples_per_gpu' in cfg.data:
+ logger.warning(
+ f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+ f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiment')
+ else:
+ logger.warning(
+ 'Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                f'{cfg.data.imgs_per_gpu} in this experiment')
+ cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
+
+ data_loaders = [
+ build_dataloader(
+ ds,
+ cfg.data.samples_per_gpu,
+ cfg.data.workers_per_gpu,
+ # cfg.gpus will be ignored if distributed
+ len(cfg.gpu_ids),
+ dist=distributed,
+ seed=cfg.seed,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ ) for ds in dataset
+ ]
+
+ # put model on gpus
+ if distributed:
+ find_unused_parameters = cfg.get('find_unused_parameters', False)
+ # Sets the `find_unused_parameters` parameter in
+ # torch.nn.parallel.DistributedDataParallel
+ model = DistributedDataParallel(
+ model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ if eval_model is not None:
+ eval_model = DistributedDataParallel(
+ eval_model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ else:
+ model = DataParallel(
+ model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+ if eval_model is not None:
+ eval_model = DataParallel(
+ eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+
+
+ # build runner
+ optimizer = build_optimizer(model, cfg.optimizer)
+
+ if 'runner' not in cfg:
+ cfg.runner = {
+ 'type': 'EpochBasedRunner',
+ 'max_epochs': cfg.total_epochs
+ }
+ warnings.warn(
+ 'config is now expected to have a `runner` section, '
+ 'please set `runner` in your config.', UserWarning)
+ else:
+ if 'total_epochs' in cfg:
+ assert cfg.total_epochs == cfg.runner.max_epochs
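+    # A typical config therefore provides something like
+    #     runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+    # which matches the fallback dict constructed above when the section is missing.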
+ if eval_model is not None:
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ eval_model=eval_model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+ else:
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+
+ # an ugly workaround to make .log and .log.json filenames the same
+ runner.timestamp = timestamp
+
+ # fp16 setting
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ optimizer_config = Fp16OptimizerHook(
+ **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
+ elif distributed and 'type' not in cfg.optimizer_config:
+ optimizer_config = OptimizerHook(**cfg.optimizer_config)
+ else:
+ optimizer_config = cfg.optimizer_config
+
+ # register hooks
+ runner.register_training_hooks(cfg.lr_config, optimizer_config,
+ cfg.checkpoint_config, cfg.log_config,
+ cfg.get('momentum_config', None))
+
+ # register profiler hook
+ #trace_config = dict(type='tb_trace', dir_name='work_dir')
+ #profiler_config = dict(on_trace_ready=trace_config)
+ #runner.register_profiler_hook(profiler_config)
+
+ if distributed:
+ if isinstance(runner, EpochBasedRunner):
+ runner.register_hook(DistSamplerSeedHook())
+
+ # register eval hooks
+ if validate:
+ # Support batch_size > 1 in validation
+ val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
+ if val_samples_per_gpu > 1:
+ assert False
+            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
+ cfg.data.val.pipeline = replace_ImageToTensor(
+ cfg.data.val.pipeline)
+ val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
+
+ val_dataloader = build_dataloader(
+ val_dataset,
+ samples_per_gpu=val_samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'),
+ )
+ eval_cfg = cfg.get('evaluation', {})
+ eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+ eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
+ eval_hook = CustomDistEvalHook if distributed else EvalHook
+ runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
+
+ # user-defined hooks
+ if cfg.get('custom_hooks', None):
+ custom_hooks = cfg.custom_hooks
+ assert isinstance(custom_hooks, list), \
+ f'custom_hooks expect list type, but got {type(custom_hooks)}'
+ for hook_cfg in cfg.custom_hooks:
+ assert isinstance(hook_cfg, dict), \
+ 'Each item in custom_hooks expects dict type, but got ' \
+ f'{type(hook_cfg)}'
+ hook_cfg = hook_cfg.copy()
+ priority = hook_cfg.pop('priority', 'NORMAL')
+ hook = build_from_cfg(hook_cfg, HOOKS)
+ runner.register_hook(hook, priority=priority)
+
+ if cfg.resume_from:
+ runner.resume(cfg.resume_from)
+ elif cfg.load_from:
+ runner.load_checkpoint(cfg.load_from)
+ runner.run(data_loaders, cfg.workflow)
+
diff --git a/adzoo/vad/apis/test.py b/adzoo/vad/apis/test.py
new file mode 100644
index 0000000..3d31abb
--- /dev/null
+++ b/adzoo/vad/apis/test.py
@@ -0,0 +1,215 @@
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import torch
+import torch.distributed as dist
+from mmcv.image import tensor2imgs
+from mmcv.utils import get_dist_info
+
+from mmcv.core import encode_mask_results
+from mmcv.fileio.io import dump, load
+from mmcv.utils import mkdir_or_exist, ProgressBar
+
+import numpy as np
+import pycocotools.mask as mask_util
+
+def custom_encode_mask_results(mask_results):
+ """Encode bitmap mask to RLE code. Semantic Masks only
+ Args:
+ mask_results (list | tuple[list]): bitmap mask results.
+ In mask scoring rcnn, mask_results is a tuple of (segm_results,
+ segm_cls_score).
+ Returns:
+ list | tuple: RLE encoded mask.
+ """
+ cls_segms = mask_results
+ num_classes = len(cls_segms)
+ encoded_mask_results = []
+ for i in range(len(cls_segms)):
+ encoded_mask_results.append(
+ mask_util.encode(
+ np.array(
+ cls_segms[i][:, :, np.newaxis], order='F',
+ dtype='uint8'))[0]) # encoded with RLE
+ return [encoded_mask_results]
+
+def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+ """Test model with multiple gpus.
+ This method tests model with multiple gpus and collects the results
+    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True',
+    it encodes results to gpu tensors and uses gpu communication for results
+    collection. In cpu mode it saves the results on different gpus to 'tmpdir'
+    and collects them by the rank 0 worker.
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+ Returns:
+        dict: Collected 'bbox_results' and 'mask_results'.
+ """
+ model.eval()
+ bbox_results = []
+ mask_results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ prog_bar = ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ have_mask = False
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(data,return_loss=False, rescale=True)
+ # encode mask results
+ if isinstance(result, dict):
+ if 'bbox_results' in result.keys():
+ bbox_result = result['bbox_results']
+ batch_size = len(result['bbox_results'])
+ bbox_results.extend(bbox_result)
+ if 'mask_results' in result.keys() and result['mask_results'] is not None:
+ mask_result = custom_encode_mask_results(result['mask_results'])
+ mask_results.extend(mask_result)
+ have_mask = True
+ else:
+ batch_size = len(result)
+ bbox_results.extend(result)
+
+ if i>150:
+ break
+
+ #if isinstance(result[0], tuple):
+ # assert False, 'this code is for instance segmentation, which our code will not utilize.'
+ # result = [(bbox_results, encode_mask_results(mask_results))
+ # for bbox_results, mask_results in result]
+ if rank == 0:
+
+ for _ in range(batch_size * world_size):
+ prog_bar.update()
+
+ # collect results from all ranks
+ if gpu_collect:
+ bbox_results = collect_results_gpu(bbox_results, len(dataset))
+ if have_mask:
+ mask_results = collect_results_gpu(mask_results, len(dataset))
+ else:
+ mask_results = None
+ else:
+ bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
+ tmpdir = tmpdir+'_mask' if tmpdir is not None else None
+ if have_mask:
+ mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
+ else:
+ mask_results = None
+
+ return {'bbox_results': bbox_results, 'mask_results': mask_results}
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
+ MAX_LEN = 512
+ # 32 is whitespace
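+        # The rank-0 process creates the temporary directory and broadcasts its path
+        # to every other rank as a fixed-length uint8 tensor (padded with spaces,
+        # ASCII 32), so that all ranks dump their partial results into the same dir.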
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_list.append(load(part_file))
+        # merge the results from all ranks
+        ordered_results = []
+        # Because the evaluation-stage sampler is changed so that each gpu handles a
+        # continuous chunk of samples, the per-rank parts are concatenated in order
+        # rather than interleaved:
+        # for res in zip(*part_list):
+ for res in part_list:
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir)
+ return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+    # gpu-side collection is not implemented here; reuse the cpu path and return
+    # the gathered results so the caller receives them on rank 0.
+    return collect_results_cpu(result_part, size)
+
+
+def single_gpu_test(model, data_loader):
+ """Test model with single gpu.
+
+ This method tests model with single gpu and gives the 'show' option.
+ By setting ``show=True``, it saves the visualization results under
+ ``out_dir``.
+
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ show (bool, optional): Whether to save viualization results.
+ Default: True.
+ out_dir (str, optional): The path to save visualization results.
+ Default: None.
+
+ Returns:
+ list[dict]: The prediction results.
+ """
+ model.eval()
+ bbox_results = []
+ mask_results = []
+ dataset = data_loader.dataset
+ prog_bar = ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ have_mask = False
+
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+            result = model(data, return_loss=False, rescale=True)
+
+ # encode mask results
+ if isinstance(result, dict):
+ if 'bbox_results' in result.keys():
+ bbox_result = result['bbox_results']
+ batch_size = len(result['bbox_results'])
+ bbox_results.extend(bbox_result)
+ if 'mask_results' in result.keys() and result['mask_results'] is not None:
+ mask_result = custom_encode_mask_results(result['mask_results'])
+ mask_results.extend(mask_result)
+ have_mask = True
+ else:
+ batch_size = len(result)
+ bbox_results.extend(result)
+
+ if isinstance(result[0], tuple):
+ assert False, 'this code is for instance segmentation, which our code will not utilize.'
+ result = [(bbox_results, encode_mask_results(mask_results))
+ for bbox_results, mask_results in result]
+
+ for _ in range(batch_size):
+ prog_bar.update()
+
+ return {'bbox_results': bbox_results, 'mask_results': mask_results}
diff --git a/adzoo/vad/apis/train.py b/adzoo/vad/apis/train.py
new file mode 100644
index 0000000..049cd5c
--- /dev/null
+++ b/adzoo/vad/apis/train.py
@@ -0,0 +1,60 @@
+from .mmdet_train import custom_train_detector
+
+
+def custom_train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ eval_model=None,
+ meta=None):
+ """A function wrapper for launching model training according to cfg.
+
+    Because we need a different eval_hook in the runner, this wrapper exists for now
+    and should be deprecated in the future.
+ """
+ if cfg.model.type in ['EncoderDecoder3D']:
+ assert False
+ else:
+ custom_train_detector(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ eval_model=eval_model,
+ meta=meta)
+
+
+def train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ meta=None):
+ """A function wrapper for launching model training according to cfg.
+
+    Because we need a different eval_hook in the runner, this wrapper exists for now
+    and should be deprecated in the future.
+ """
+ if cfg.model.type in ['EncoderDecoder3D']:
+ train_segmentor(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ meta=meta)
+ else:
+ train_detector(
+ model,
+ dataset,
+ cfg,
+ distributed=distributed,
+ validate=validate,
+ timestamp=timestamp,
+ meta=meta)
diff --git a/adzoo/vad/configs/VAD/VAD_base_e2e.py b/adzoo/vad/configs/VAD/VAD_base_e2e.py
new file mode 100644
index 0000000..10319db
--- /dev/null
+++ b/adzoo/vad/configs/VAD/VAD_base_e2e.py
@@ -0,0 +1,438 @@
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+#
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+num_classes = len(class_names)
+
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing', 'boundary']
+map_num_vec = 100
+map_fixed_ptsnum_per_gt_line = 20 # currently only fixed_pts > 0 is supported
+map_fixed_ptsnum_per_pred_line = 20
+map_eval_use_same_gt_sample_num_flag = True
+map_num_classes = len(map_classes)
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+queue_length = 4 # each sequence contains `queue_length` frames.
+total_epochs = 60
+
+model = dict(
+ type='VAD',
+ use_grid_mask=True,
+ video_test_mode=True,
+ pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='VADHead',
+ map_thresh=0.5,
+ dis_thresh=0.2,
+ pe_normalization=True,
+ tot_epoch=total_epochs,
+ use_traj_lr_warmup=False,
+ query_thresh=0.0,
+ query_use_fix_pad=False,
+ ego_his_encoder=None,
+ ego_lcf_feat_idx=None,
+ valid_fut_ts=6,
+ ego_agent_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ ego_map_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ motion_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ motion_map_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ use_pe=True,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=300,
+ num_classes=num_classes,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ map_num_vec=map_num_vec,
+ map_num_classes=map_num_classes,
+ map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line,
+ map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line,
+ map_query_embed_type='instance_pts',
+ map_transform_method='minmax',
+ map_gt_shift_pts_pattern='v2',
+ map_dir_interval=1,
+ map_code_size=2,
+ map_code_weights=[1.0, 1.0, 1.0, 1.0],
+ transformer=dict(
+ type='VADPerceptionTransformer',
+ map_num_vec=map_num_vec,
+ map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line,
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ map_decoder=dict(
+ type='MapDetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='CustomNMSFreeCoder',
+ post_center_range=[-20, -35, -10.0, 20, 35, 10.0],
+ pc_range=point_cloud_range,
+ max_num=100,
+ voxel_size=voxel_size,
+ num_classes=num_classes),
+ map_bbox_coder=dict(
+ type='MapNMSFreeCoder',
+ post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+ pc_range=point_cloud_range,
+ max_num=50,
+ voxel_size=voxel_size,
+ num_classes=map_num_classes),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_traj=dict(type='L1Loss', loss_weight=0.2),
+ loss_traj_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=0.2),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+ loss_map_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_map_bbox=dict(type='L1Loss', loss_weight=0.0),
+ loss_map_iou=dict(type='GIoULoss', loss_weight=0.0),
+ loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0),
+ loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
+ loss_plan_reg=dict(type='L1Loss', loss_weight=1.0),
+ loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0),
+ loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0),
+ loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+ iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+ pc_range=point_cloud_range),
+ map_assigner=dict(
+ type='MapHungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+ pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0),
+ pc_range=point_cloud_range))))
+
+dataset_type = 'VADCustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True),
+ dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='VADObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='VADFormatBundle3D', class_names=class_names, with_ego=True),
+ dict(type='CustomCollect3D',\
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs',
+ 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat', 'gt_attr_labels'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True),
+ dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='VADObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ # dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True),
+ dict(type='CustomCollect3D',\
+ keys=['points', 'gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag',
+ 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd',
+ 'ego_lcf_feat', 'gt_attr_labels'])])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'vad_nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ bev_size=(bev_h_, bev_w_),
+ pc_range=point_cloud_range,
+ queue_length=queue_length,
+ map_classes=map_classes,
+ map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line,
+ map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ custom_eval_version='vad_nusc_detection_cvpr_2019'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ pc_range=point_cloud_range,
+ ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1,
+ map_classes=map_classes,
+ map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+ map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line,
+ map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag,
+ use_pkl_result=True,
+ custom_eval_version='vad_nusc_detection_cvpr_2019'),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ pc_range=point_cloud_range,
+ ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1,
+ map_classes=map_classes,
+ map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+ map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line,
+ map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag,
+ use_pkl_result=True,
+ custom_eval_version='vad_nusc_detection_cvpr_2019'),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+
+evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+ interval=1,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+# fp16 = dict(loss_scale=512.)
+# find_unused_parameters = True
+checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs)
+
+
+custom_hooks = [dict(type='CustomSetEpochInfoHook')]
\ No newline at end of file
diff --git a/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py b/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py
new file mode 100644
index 0000000..8d59fa0
--- /dev/null
+++ b/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py
@@ -0,0 +1,568 @@
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# Bench2Drive uses its own class set; raw CARLA actor types are mapped to it via NameMapping below.
+
+NameMapping = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.gazelle.omafiets": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "vehicle.audi.etron": 'car',
+ "vehicle.ford.crown": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.tesla.model3": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "vehicle.ford.ambulance": "van",
+ # truck
+ "vehicle.carlamotors.firetruck": 'truck',
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'traffic_sign',
+ "traffic.speed_limit.40": 'traffic_sign',
+ "traffic.speed_limit.50": 'traffic_sign',
+ "traffic.speed_limit.60": 'traffic_sign',
+ "traffic.speed_limit.90": 'traffic_sign',
+ "traffic.speed_limit.120": 'traffic_sign',
+
+ "traffic.stop": 'traffic_sign',
+ "traffic.yield": 'traffic_sign',
+ "traffic.traffic_light": 'traffic_light',
+ #=========================================
+
+ #===================Construction===========
+ "static.prop.warningconstruction" : 'traffic_cone',
+ "static.prop.warningaccident": 'traffic_cone',
+ "static.prop.trafficwarning": "traffic_cone",
+ "static.prop.constructioncone": 'traffic_cone',
+
+ #=================pedestrian==============
+ "walker.pedestrian.0001": 'pedestrian',
+ "walker.pedestrian.0004": 'pedestrian',
+ "walker.pedestrian.0005": 'pedestrian',
+ "walker.pedestrian.0007": 'pedestrian',
+ "walker.pedestrian.0013": 'pedestrian',
+ "walker.pedestrian.0014": 'pedestrian',
+ "walker.pedestrian.0017": 'pedestrian',
+ "walker.pedestrian.0018": 'pedestrian',
+ "walker.pedestrian.0019": 'pedestrian',
+ "walker.pedestrian.0020": 'pedestrian',
+ "walker.pedestrian.0022": 'pedestrian',
+ "walker.pedestrian.0025": 'pedestrian',
+ "walker.pedestrian.0035": 'pedestrian',
+ "walker.pedestrian.0041": 'pedestrian',
+ "walker.pedestrian.0046": 'pedestrian',
+ "walker.pedestrian.0047": 'pedestrian',
+
+ # ==========================================
+ "static.prop.dirtdebris01": 'others',
+ "static.prop.dirtdebris02": 'others',
+}
+
+eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'},
+ "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)}
+ }
+
+class_names = [
+'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others'
+]
+num_classes = len(class_names)
+
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['Broken','Solid','SolidSolid','Center','TrafficLight','StopSign']
+map_num_vec = 100
+map_fixed_ptsnum_per_gt_line = 20 # currently only fixed_pts > 0 is supported
+map_fixed_ptsnum_per_pred_line = 20
+map_eval_use_same_gt_sample_num_flag = True
+map_num_classes = len(map_classes)
+past_frames = 2
+future_frames = 6
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+queue_length = 4 # each sequence contains `queue_length` frames.
+total_epochs = 60
+
+model = dict(
+ type='VAD',
+ use_grid_mask=True,
+ video_test_mode=True,
+ pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='VADHead',
+ map_thresh=0.5,
+ dis_thresh=0.2,
+ pe_normalization=True,
+ tot_epoch=total_epochs,
+ use_traj_lr_warmup=False,
+ query_thresh=0.0,
+ query_use_fix_pad=False,
+ ego_his_encoder=None,
+ ego_lcf_feat_idx=None,
+ valid_fut_ts=6,
+ ego_fut_mode=6,
+ ego_agent_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ ego_map_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ motion_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ motion_map_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ use_pe=True,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=300,
+ num_classes=num_classes,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ map_num_vec=map_num_vec,
+ map_num_classes=map_num_classes,
+ map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line,
+ map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line,
+ map_query_embed_type='instance_pts',
+ map_transform_method='minmax',
+ map_gt_shift_pts_pattern='v2',
+ map_dir_interval=1,
+ map_code_size=2,
+ map_code_weights=[1.0, 1.0, 1.0, 1.0],
+ transformer=dict(
+ type='VADPerceptionTransformer',
+ map_num_vec=map_num_vec,
+ map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line,
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=6,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ map_decoder=dict(
+ type='MapDetectionTransformerDecoder',
+ num_layers=6,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='CustomNMSFreeCoder',
+ post_center_range=[-20, -35, -10.0, 20, 35, 10.0],
+ pc_range=point_cloud_range,
+ max_num=100,
+ voxel_size=voxel_size,
+ num_classes=num_classes),
+ map_bbox_coder=dict(
+ type='MapNMSFreeCoder',
+ post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+ pc_range=point_cloud_range,
+ max_num=50,
+ voxel_size=voxel_size,
+ num_classes=map_num_classes),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_traj=dict(type='L1Loss', loss_weight=0.2),
+ loss_traj_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=0.2),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+ loss_map_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_map_bbox=dict(type='L1Loss', loss_weight=0.0),
+ loss_map_iou=dict(type='GIoULoss', loss_weight=0.0),
+ loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0),
+ loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
+ loss_plan_reg=dict(type='L1Loss', loss_weight=1.0),
+ loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0),
+ loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0),
+ loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+ iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+ pc_range=point_cloud_range),
+ map_assigner=dict(
+ type='MapHungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+ pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0),
+ pc_range=point_cloud_range))))
+
+dataset_type = "B2D_VAD_Dataset"
+data_root = "data/bench2drive"
+info_root = "data/infos"
+map_root = "data/bench2drive/maps"
+map_file = "data/infos/b2d_map_infos.pkl"
+file_client_args = dict(backend="disk")
+ann_file_train = info_root + "/b2d_infos_train.pkl"
+ann_file_val = info_root + "/b2d_infos_val.pkl"
+ann_file_test = info_root + "/b2d_infos_val.pkl"
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True),
+ dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='VADObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='VADFormatBundle3D', class_names=class_names, with_ego=True),
+ dict(type='CustomCollect3D',\
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs','gt_attr_labels','ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True),
+ dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='VADObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ # dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True),
+ dict(type='CustomCollect3D',\
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag',
+ 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd',
+ 'ego_lcf_feat','gt_attr_labels'])])
+]
+
+inference_only_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True),
+ dict(type='CustomCollect3D', keys=[ 'img', 'ego_fut_cmd'])])
+]
+
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=6,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ past_frames=past_frames,
+ future_frames=future_frames,
+ point_cloud_range=point_cloud_range,
+ polyline_points_num=map_fixed_ptsnum_per_gt_line,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ #custom_eval_version='vad_nusc_detection_cvpr_2019'
+ ),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
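+ # NOTE: as written, this val split reuses the training infos and pipeline;
+ # point it at `ann_file_val` / `test_pipeline` for a held-out evaluation.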
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ past_frames=past_frames,
+ future_frames=future_frames,
+ point_cloud_range=point_cloud_range,
+ polyline_points_num=map_fixed_ptsnum_per_gt_line,
+ #use_pkl_result=True,
+ #custom_eval_version='vad_nusc_detection_cvpr_2019'
+ ),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ ann_file=ann_file_val,
+ pipeline=test_pipeline,
+ classes=class_names,
+ name_mapping=NameMapping,
+ map_root=map_root,
+ map_file=map_file,
+ modality=input_modality,
+ bev_size=(bev_h_, bev_w_),
+ queue_length=queue_length,
+ past_frames=past_frames,
+ future_frames=future_frames,
+ point_cloud_range=point_cloud_range,
+ polyline_points_num=map_fixed_ptsnum_per_gt_line,
+ eval_cfg=eval_cfg
+ ),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+
+evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+ interval=1,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+# fp16 = dict(loss_scale=512.)
+# find_unused_parameters = True
+checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs)
+
+
+custom_hooks = [dict(type='CustomSetEpochInfoHook')]
\ No newline at end of file
diff --git a/adzoo/vad/configs/VAD/VAD_tiny_e2e.py b/adzoo/vad/configs/VAD/VAD_tiny_e2e.py
new file mode 100644
index 0000000..67e088a
--- /dev/null
+++ b/adzoo/vad/configs/VAD/VAD_tiny_e2e.py
@@ -0,0 +1,454 @@
+_base_ = [
+ '../datasets/custom_nus-3d.py',
+ '../_base_/default_runtime.py'
+]
+
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+num_classes = len(class_names)
+
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing', 'boundary']
+map_num_vec = 100
+map_fixed_ptsnum_per_gt_line = 20 # currently only fixed_pts > 0 is supported
+map_fixed_ptsnum_per_pred_line = 20
+map_eval_use_same_gt_sample_num_flag = True
+map_num_classes = len(map_classes)
+
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+bev_h_ = 100
+bev_w_ = 100
+queue_length = 3 # each sequence contains `queue_length` frames.
+total_epochs = 60
+
+model = dict(
+ type='VAD',
+ use_grid_mask=True,
+ video_test_mode=True,
+ pretrained=dict(img='torchvision://resnet50'),
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(3,),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='pytorch'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=_num_levels_,
+ relu_before_extra_convs=True),
+ pts_bbox_head=dict(
+ type='VADHead',
+ map_thresh=0.5,
+ dis_thresh=0.2,
+ pe_normalization=True,
+ tot_epoch=total_epochs,
+ use_traj_lr_warmup=False,
+ query_thresh=0.0,
+ query_use_fix_pad=False,
+ ego_his_encoder=None,
+ ego_lcf_feat_idx=None,
+ valid_fut_ts=6,
+ ego_agent_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ ego_map_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ motion_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ motion_map_decoder=dict(
+ type='CustomTransformerDecoder',
+ num_layers=1,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm'))),
+ use_pe=True,
+ bev_h=bev_h_,
+ bev_w=bev_w_,
+ num_query=300,
+ num_classes=num_classes,
+ in_channels=_dim_,
+ sync_cls_avg_factor=True,
+ with_box_refine=True,
+ as_two_stage=False,
+ map_num_vec=map_num_vec,
+ map_num_classes=map_num_classes,
+ map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line,
+ map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line,
+ map_query_embed_type='instance_pts',
+ map_transform_method='minmax',
+ map_gt_shift_pts_pattern='v2',
+ map_dir_interval=1,
+ map_code_size=2,
+ map_code_weights=[1.0, 1.0, 1.0, 1.0],
+ transformer=dict(
+ type='VADPerceptionTransformer',
+ map_num_vec=map_num_vec,
+ map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line,
+ rotate_prev_bev=True,
+ use_shift=True,
+ use_can_bus=True,
+ embed_dims=_dim_,
+ encoder=dict(
+ type='BEVFormerEncoder',
+ num_layers=3,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=4,
+ return_intermediate=False,
+ transformerlayers=dict(
+ type='BEVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TemporalSelfAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ dict(
+ type='SpatialCrossAttention',
+ pc_range=point_cloud_range,
+ deformable_attention=dict(
+ type='MSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_points=8,
+ num_levels=_num_levels_),
+ embed_dims=_dim_,
+ )
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ decoder=dict(
+ type='DetectionTransformerDecoder',
+ num_layers=3,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm'))),
+ map_decoder=dict(
+ type='MapDetectionTransformerDecoder',
+ num_layers=3,
+ return_intermediate=True,
+ transformerlayers=dict(
+ type='DetrTransformerDecoderLayer',
+ attn_cfgs=[
+ dict(
+ type='MultiheadAttention',
+ embed_dims=_dim_,
+ num_heads=8,
+ dropout=0.1),
+ dict(
+ type='CustomMSDeformableAttention',
+ embed_dims=_dim_,
+ num_levels=1),
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+ 'ffn', 'norm')))),
+ bbox_coder=dict(
+ type='CustomNMSFreeCoder',
+ post_center_range=[-20, -35, -10.0, 20, 35, 10.0],
+ pc_range=point_cloud_range,
+ max_num=100,
+ voxel_size=voxel_size,
+ num_classes=num_classes),
+ map_bbox_coder=dict(
+ type='MapNMSFreeCoder',
+ post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+ pc_range=point_cloud_range,
+ max_num=50,
+ voxel_size=voxel_size,
+ num_classes=map_num_classes),
+ positional_encoding=dict(
+ type='LearnedPositionalEncoding',
+ num_feats=_pos_dim_,
+ row_num_embed=bev_h_,
+ col_num_embed=bev_w_,
+ ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+ loss_traj=dict(type='L1Loss', loss_weight=0.2),
+ loss_traj_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=0.2),
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+ loss_map_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=2.0),
+ loss_map_bbox=dict(type='L1Loss', loss_weight=0.0),
+ loss_map_iou=dict(type='GIoULoss', loss_weight=0.0),
+ loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0),
+ loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
+ loss_plan_reg=dict(type='L1Loss', loss_weight=1.0),
+ loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0),
+ loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0),
+ loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5)),
+ # model training and testing settings
+ train_cfg=dict(pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ point_cloud_range=point_cloud_range,
+ out_size_factor=4,
+ assigner=dict(
+ type='HungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+ iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+ pc_range=point_cloud_range),
+ map_assigner=dict(
+ type='MapHungarianAssigner3D',
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
+ reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+ pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0),
+ pc_range=point_cloud_range))))
+
+dataset_type = 'VADCustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True),
+ dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='CustomObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.4]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_ego=True),
+ dict(type='CustomCollect3D',\
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs',
+ 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat', 'gt_attr_labels'])
+]
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True),
+ dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='CustomObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ # dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.4]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_label=False, with_ego=True),
+ dict(type='CustomCollect3D',\
+ keys=['points', 'gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag',
+ 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd',
+ 'ego_lcf_feat', 'gt_attr_labels'])])
+]
+
+inference_only_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1600, 900),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True),
+ dict(type='CustomCollect3D', keys=[ 'img', 'ego_fut_cmd'])])
+]
+
+data = dict(
+ samples_per_gpu=1,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'vad_nuscenes_infos_temporal_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ use_valid_flag=True,
+ bev_size=(bev_h_, bev_w_),
+ pc_range=point_cloud_range,
+ queue_length=queue_length,
+ map_classes=map_classes,
+ map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line,
+ map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ custom_eval_version='vad_nusc_detection_cvpr_2019'),
+ val=dict(type=dataset_type,
+ data_root=data_root,
+ pc_range=point_cloud_range,
+ ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1,
+ map_classes=map_classes,
+ map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+ map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line,
+ map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag,
+ use_pkl_result=True,
+ custom_eval_version='vad_nusc_detection_cvpr_2019'),
+ test=dict(type=dataset_type,
+ data_root=data_root,
+ pc_range=point_cloud_range,
+ ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl',
+ pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+ classes=class_names, modality=input_modality, samples_per_gpu=1,
+ map_classes=map_classes,
+ map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+ map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line,
+ map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag,
+ use_pkl_result=True,
+ custom_eval_version='vad_nusc_detection_cvpr_2019'),
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
+ nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+ type='AdamW',
+ lr=2e-4,
+ paramwise_cfg=dict(
+ custom_keys={
+ 'img_backbone': dict(lr_mult=0.1),
+ }),
+ weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=1.0 / 3,
+ min_lr_ratio=1e-3)
+
+evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+ interval=100,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+# fp16 = dict(loss_scale=512.)
+# find_unused_parameters = True
+checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs)
+
+
+custom_hooks = [dict(type='CustomSetEpochInfoHook')]
\ No newline at end of file
diff --git a/adzoo/vad/configs/_base_/datasets/coco_instance.py b/adzoo/vad/configs/_base_/datasets/coco_instance.py
new file mode 100644
index 0000000..f6ea4f4
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/coco_instance.py
@@ -0,0 +1,48 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+ dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 800),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/instances_train2017.json',
+ img_prefix=data_root + 'train2017/',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/instances_val2017.json',
+ img_prefix=data_root + 'val2017/',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/instances_val2017.json',
+ img_prefix=data_root + 'val2017/',
+ pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/adzoo/vad/configs/_base_/datasets/kitti-3d-3class.py b/adzoo/vad/configs/_base_/datasets/kitti-3d-3class.py
new file mode 100644
index 0000000..1822af4
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/kitti-3d-3class.py
@@ -0,0 +1,140 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'kitti_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(
+ filter_by_difficulty=[-1],
+ filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+ classes=class_names,
+ sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6))
+
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='ObjectNoise',
+ num_try=100,
+ translation_std=[1.0, 1.0, 0.5],
+ global_rot_range=[0.0, 0.0],
+ rot_range=[-0.78539816, 0.78539816]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in the show function;
+# please keep its loading steps consistent with test_pipeline (e.g. the file client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=6,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_train.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=1, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/kitti-3d-car.py b/adzoo/vad/configs/_base_/datasets/kitti-3d-car.py
new file mode 100644
index 0000000..1e81226
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/kitti-3d-car.py
@@ -0,0 +1,138 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'kitti_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+ classes=class_names,
+ sample_groups=dict(Car=15))
+
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='ObjectNoise',
+ num_try=100,
+ translation_std=[1.0, 1.0, 0.5],
+ global_rot_range=[0.0, 0.0],
+ rot_range=[-0.78539816, 0.78539816]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in the show function;
+# please keep its loading steps consistent with test_pipeline (e.g. the file client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=6,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_train.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'kitti_infos_val.pkl',
+ split='training',
+ pts_prefix='velodyne_reduced',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=1, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/lyft-3d.py b/adzoo/vad/configs/_base_/datasets/lyft-3d.py
new file mode 100644
index 0000000..71baff0
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+ 'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for the Lyft dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/lyft/': 's3://lyft/lyft/',
+# 'data/lyft/': 's3://lyft/lyft/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_test.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True))
+# For the Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/nuim_instance.py b/adzoo/vad/configs/_base_/datasets/nuim_instance.py
new file mode 100644
index 0000000..82fce56
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/nuim_instance.py
@@ -0,0 +1,59 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/nuimages/'
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
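+# Standard ImageNet mean/std in RGB order; to_rgb=True converts the BGR image
+# loaded by mmcv to RGB before normalization.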
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+ dict(
+ type='Resize',
+ img_scale=[(1280, 720), (1920, 1080)],
+ multiscale_mode='range',
+ keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1600, 900),
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/nuimages_v1.0-train.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/adzoo/vad/configs/_base_/datasets/nus-3d.py b/adzoo/vad/configs/_base_/datasets/nus-3d.py
new file mode 100644
index 0000000..1548171
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/nus-3d.py
@@ -0,0 +1,142 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
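+# Each nuScenes lidar .bin stores 5 values per point (x, y, z, intensity and
+# ring index), hence load_dim=5 / use_dim=5 below.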
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+        # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
+        # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+# For the nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/nus-mono3d.py b/adzoo/vad/configs/_base_/datasets/nus-mono3d.py
new file mode 100644
index 0000000..1363a94
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/nus-mono3d.py
@@ -0,0 +1,100 @@
+dataset_type = 'CustomNuScenesMonoDataset'
+data_root = 'data/nuscenes/'
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=False,
+ use_camera=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox=True,
+ with_label=True,
+ with_attr_label=True,
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_bbox_depth=True),
+ dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='Collect3D',
+ keys=[
+ 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
+ 'gt_labels_3d', 'centers2d', 'depths'
+ ]),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='MultiScaleFlipAug',
+ scale_factor=1.0,
+ flip=False,
+ transforms=[
+ dict(type='RandomFlip3D'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['img']),
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['img'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=train_pipeline,
+ modality=input_modality,
+ test_mode=False,
+ box_type_3d='Camera'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='Camera'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
+ img_prefix=data_root,
+ classes=class_names,
+ pipeline=test_pipeline,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='Camera'))
+evaluation = dict(interval=2)
diff --git a/adzoo/vad/configs/_base_/datasets/range100_lyft-3d.py b/adzoo/vad/configs/_base_/datasets/range100_lyft-3d.py
new file mode 100644
index 0000000..efa63ea
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/range100_lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-100, -100, -5, 100, 100, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+ 'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for the Lyft dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/lyft/': 's3://lyft/lyft/',
+# 'data/lyft/': 's3://lyft/lyft/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_test.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True))
+# For the Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/s3dis-3d-5class.py b/adzoo/vad/configs/_base_/datasets/s3dis-3d-5class.py
new file mode 100644
index 0000000..2422766
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/s3dis-3d-5class.py
@@ -0,0 +1,114 @@
+# dataset settings
+dataset_type = 'S3DISDataset'
+data_root = './data/s3dis/'
+class_names = ('table', 'chair', 'sofa', 'bookcase', 'board')
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
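+# Standard S3DIS protocol: train on Areas 1-4 and 6, evaluate on the held-out
+# Area 5.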
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+        # following the ScanNet dataset, the rotation range is 5 degrees
+ rot_range=[-0.087266, 0.087266],
+ scale_ratio_range=[1.0, 1.0],
+ shift_height=True),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type='ConcatDataset',
+ datasets=[
+ dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + f's3dis_infos_Area_{i}.pkl',
+ pipeline=train_pipeline,
+ filter_empty_gt=False,
+ classes=class_names,
+ box_type_3d='Depth') for i in train_area
+ ],
+ separate_eval=False)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/s3dis_seg-3d-13class.py b/adzoo/vad/configs/_base_/datasets/s3dis_seg-3d-13class.py
new file mode 100644
index 0000000..39bf556
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/s3dis_seg-3d-13class.py
@@ -0,0 +1,139 @@
+# dataset settings
+dataset_type = 'S3DISSegDataset'
+data_root = './data/s3dis/'
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+ 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+num_points = 4096
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
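+# Training samples are block_size x block_size scene patches of num_points
+# points each (see IndoorPatchPointSample below); ignore_index = len(class_names)
+# marks points that the segmentation loss should ignore.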
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=tuple(range(len(class_names))),
+ max_cat_id=13),
+ dict(
+ type='IndoorPatchPointSample',
+ num_points=num_points,
+ block_size=1.0,
+ ignore_index=len(class_names),
+ use_normalized_coord=True,
+ enlarge_size=0.2,
+ min_unique_num=None),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(
+        # a wrapper needed to call the test function;
+        # no test-time augmentation is actually performed
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.0,
+ flip_ratio_bev_vertical=0.0),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=tuple(range(len(class_names))),
+ max_cat_id=13),
+ dict(
+ type='DefaultFormatBundle3D',
+ with_label=False,
+ class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ # train on area 1, 2, 3, 4, 6
+ # test on area 5
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_files=[
+ data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area
+ ],
+ pipeline=train_pipeline,
+ classes=class_names,
+ test_mode=False,
+ ignore_index=len(class_names),
+ scene_idxs=[
+ data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy'
+ for i in train_area
+ ]),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names),
+ scene_idxs=data_root +
+ f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names)))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/scannet-3d-18class.py b/adzoo/vad/configs/_base_/datasets/scannet-3d-18class.py
new file mode 100644
index 0000000..93da1e5
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/scannet-3d-18class.py
@@ -0,0 +1,128 @@
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+ 'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+ 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+ 'garbagebin')
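+# The PointSegClassMapping step in train_pipeline maps the raw NYU40-style
+# category ids listed in valid_cat_ids to contiguous training labels for these
+# 18 classes; all other ids are ignored.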
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_mask_3d=True,
+ with_seg_3d=True),
+ dict(type='GlobalAlignment', rotation_axis=2),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
+ 36, 39),
+ max_cat_id=40),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.087266, 0.087266],
+ scale_ratio_range=[1.0, 1.0],
+ shift_height=True),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(
+ type='Collect3D',
+ keys=[
+ 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+ 'pts_instance_mask'
+ ])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(type='GlobalAlignment', rotation_axis=2),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(type='PointSample', num_points=40000),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(type='GlobalAlignment', rotation_axis=2),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_train.pkl',
+ pipeline=train_pipeline,
+ filter_empty_gt=False,
+ classes=class_names,
+            # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
+            # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
+ box_type_3d='Depth')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/scannet_seg-3d-20class.py b/adzoo/vad/configs/_base_/datasets/scannet_seg-3d-20class.py
new file mode 100644
index 0000000..cf73b09
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/scannet_seg-3d-20class.py
@@ -0,0 +1,132 @@
+# dataset settings
+dataset_type = 'ScanNetSegDataset'
+data_root = './data/scannet/'
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+ 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+ 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+ 'bathtub', 'otherfurniture')
+num_points = 8192
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+ 33, 34, 36, 39),
+ max_cat_id=40),
+ dict(
+ type='IndoorPatchPointSample',
+ num_points=num_points,
+ block_size=1.5,
+ ignore_index=len(class_names),
+ use_normalized_coord=False,
+ enlarge_size=0.2,
+ min_unique_num=None),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(type='NormalizePointsColor', color_mean=None),
+ dict(
+        # a wrapper needed to call the test function;
+        # no test-time augmentation is actually performed
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.0,
+ flip_ratio_bev_vertical=0.0),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ use_color=True,
+ load_dim=6,
+ use_dim=[0, 1, 2, 3, 4, 5]),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_mask_3d=False,
+ with_seg_3d=True),
+ dict(
+ type='PointSegClassMapping',
+ valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+ 33, 34, 36, 39),
+ max_cat_id=40),
+ dict(
+ type='DefaultFormatBundle3D',
+ with_label=False,
+ class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ test_mode=False,
+ ignore_index=len(class_names),
+ scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names)),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'scannet_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ ignore_index=len(class_names)))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/sunrgbd-3d-10class.py b/adzoo/vad/configs/_base_/datasets/sunrgbd-3d-10class.py
new file mode 100644
index 0000000..7121b75
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/sunrgbd-3d-10class.py
@@ -0,0 +1,107 @@
+dataset_type = 'SUNRGBDDataset'
+data_root = 'data/sunrgbd/'
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+ 'night_stand', 'bookshelf', 'bathtub')
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(type='LoadAnnotations3D'),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ ),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.523599, 0.523599],
+ scale_ratio_range=[0.85, 1.15],
+ shift_height=True),
+ dict(type='PointSample', num_points=20000),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=True,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ ),
+ dict(type='PointSample', num_points=20000),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='DEPTH',
+ shift_height=False,
+ load_dim=6,
+ use_dim=[0, 1, 2]),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'sunrgbd_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ filter_empty_gt=False,
+            # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
+            # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
+ box_type_3d='Depth')),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'sunrgbd_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'sunrgbd_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/waymoD5-3d-3class.py b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-3class.py
new file mode 100644
index 0000000..920ac15
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-3class.py
@@ -0,0 +1,145 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'LidarWaymoDataset'
+data_root = 'data/waymo-full/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
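+# db_sampler drives GT-sampling augmentation: the ObjectSample transform in
+# train_pipeline pastes ground-truth objects drawn from waymo_dbinfos_train.pkl
+# into each training scene (up to 15 cars and 10 each of pedestrians/cyclists).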
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'waymo_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(
+ filter_by_difficulty=[-1],
+ filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+ classes=class_names,
+ sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+ points_loader=dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=[0, 1, 2, 3, 4],
+ file_client_args=file_client_args))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_train.pkl',
+ split='training',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+            # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
+            # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR',
+ # load one frame every five frames
+ load_interval=5)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/datasets/waymoD5-3d-car.py b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-car.py
new file mode 100644
index 0000000..02e2627
--- /dev/null
+++ b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-car.py
@@ -0,0 +1,143 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if you use Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+class_names = ['Car']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'waymo_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+ classes=class_names,
+ sample_groups=dict(Car=15),
+ points_loader=dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=[0, 1, 2, 3, 4],
+ file_client_args=file_client_args))
+
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args),
+ dict(type='ObjectSample', db_sampler=db_sampler),
+ dict(
+ type='RandomFlip3D',
+ sync_2d=False,
+ flip_ratio_bev_horizontal=0.5,
+ flip_ratio_bev_vertical=0.5),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05]),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_train.pkl',
+ split='training',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+            # we use box_type_3d='LiDAR' in the KITTI and nuScenes datasets
+            # and box_type_3d='Depth' in the SUN RGB-D and ScanNet datasets.
+ box_type_3d='LiDAR',
+ # load one frame every five frames
+ load_interval=5)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/adzoo/vad/configs/_base_/default_runtime.py b/adzoo/vad/configs/_base_/default_runtime.py
new file mode 100644
index 0000000..4e85b69
--- /dev/null
+++ b/adzoo/vad/configs/_base_/default_runtime.py
@@ -0,0 +1,18 @@
+checkpoint_config = dict(interval=1)
+# yapf:disable
+# By default we use the text logger hook and TensorBoard.
+# For more loggers see
+# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = None
+load_from = None
+resume_from = None
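+# workflow = [('train', 1)] runs one training epoch per cycle; validation is
+# triggered by the evaluation hook configured in the dataset configs, not by
+# the workflow itself.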
+workflow = [('train', 1)]
diff --git a/adzoo/vad/configs/_base_/models/3dssd.py b/adzoo/vad/configs/_base_/models/3dssd.py
new file mode 100644
index 0000000..55344c7
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/3dssd.py
@@ -0,0 +1,77 @@
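+# fps_mods selects the point-sampling strategy per SA layer: 'D-FPS' is
+# farthest point sampling in Euclidean space, 'F-FPS' samples in feature space,
+# and 'FS' fuses both, following the 3DSSD paper.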
+model = dict(
+ type='SSD3DNet',
+ backbone=dict(
+ type='PointNet2SAMSG',
+ in_channels=4,
+ num_points=(4096, 512, (256, 256)),
+ radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
+ num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)),
+ sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)),
+ ((64, 64, 128), (64, 64, 128), (64, 96, 128)),
+ ((128, 128, 256), (128, 192, 256), (128, 256, 256))),
+ aggregation_channels=(64, 128, 256),
+ fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
+ fps_sample_range_lists=((-1), (-1), (512, -1)),
+ norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+ sa_cfg=dict(
+ type='PointSAModuleMSG',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False)),
+ bbox_head=dict(
+ type='SSD3DHead',
+ in_channels=256,
+ vote_module_cfg=dict(
+ in_channels=256,
+ num_points=256,
+ gt_per_seed=1,
+ conv_channels=(128, ),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+ with_res_feat=False,
+ vote_xyz_range=(3.0, 3.0, 2.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModuleMSG',
+ num_point=256,
+ radii=(4.8, 6.4),
+ sample_nums=(16, 32),
+ mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)),
+ norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+ use_xyz=True,
+ normalize_xyz=False,
+ bias=True),
+ pred_layer_cfg=dict(
+ in_channels=1536,
+ shared_conv_channels=(512, 128),
+ cls_conv_channels=(128, ),
+ reg_conv_channels=(128, ),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+ bias=True),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ loss_weight=1.0),
+ center_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ corner_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+ vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(
+ sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05),
+ test_cfg=dict(
+ nms_cfg=dict(type='nms', iou_thr=0.1),
+ sample_mod='spec',
+ score_thr=0.0,
+ per_class_proposal=True,
+ max_output_num=100))
diff --git a/adzoo/vad/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/adzoo/vad/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000..fb9e0a8
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
@@ -0,0 +1,200 @@
+# model settings
+model = dict(
+ type='CascadeRCNN',
+ pretrained='torchvision://resnet50',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+ roi_head=dict(
+ type='CascadeRoIHead',
+ num_stages=3,
+ stage_loss_weights=[1, 0.5, 0.25],
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=[
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.05, 0.05, 0.1, 0.1]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+ loss_weight=1.0)),
+ dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.033, 0.033, 0.067, 0.067]),
+ reg_class_agnostic=True,
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+ ],
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(
+ type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ # model training and testing settings
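+    # The three cascade stages are trained with progressively stricter IoU
+    # thresholds (0.5, 0.6, 0.7), matching the tighter target_stds of the
+    # corresponding bbox heads above.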
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,
+ nms_post=2000,
+ max_num=2000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=[
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.6,
+ min_pos_iou=0.6,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False),
+ dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.7,
+ min_pos_iou=0.7,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)
+ ]),
+ test_cfg=dict(
+ rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
diff --git a/adzoo/vad/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/adzoo/vad/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
new file mode 100644
index 0000000..efdce59
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
@@ -0,0 +1,83 @@
+voxel_size = [0.1, 0.1, 0.2]
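+# 0.1 m x 0.1 m x 0.2 m voxels; with the 1024 x 1024 grid in train_cfg this
+# covers roughly a 102.4 m x 102.4 m area around the ego vehicle.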
+model = dict(
+ type='CenterPoint',
+ pts_voxel_layer=dict(
+ max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)),
+ pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+ pts_middle_encoder=dict(
+ type='SparseEncoder',
+ in_channels=5,
+ sparse_shape=[41, 1024, 1024],
+ output_channels=128,
+ order=('conv', 'norm', 'act'),
+ encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+ 128)),
+ encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
+ block_type='basicblock'),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=256,
+ out_channels=[128, 256],
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ conv_cfg=dict(type='Conv2d', bias=False)),
+ pts_neck=dict(
+ type='SECONDFPN',
+ in_channels=[128, 256],
+ out_channels=[256, 256],
+ upsample_strides=[1, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ upsample_cfg=dict(type='deconv', bias=False),
+ use_conv_for_no_stride=True),
+ pts_bbox_head=dict(
+ type='CenterHead',
+ in_channels=sum([256, 256]),
+ tasks=[
+ dict(num_class=1, class_names=['car']),
+ dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+ dict(num_class=2, class_names=['bus', 'trailer']),
+ dict(num_class=1, class_names=['barrier']),
+ dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+ dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+ ],
+ common_heads=dict(
+ reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+ share_conv_channel=64,
+ bbox_coder=dict(
+ type='CenterPointBBoxCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_num=500,
+ score_threshold=0.1,
+ out_size_factor=8,
+ voxel_size=voxel_size[:2],
+ code_size=9),
+ separate_head=dict(
+ type='SeparateHead', init_bias=-2.19, final_kernel=3),
+ loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+ loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+ norm_bbox=True),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[1024, 1024, 40],
+ voxel_size=voxel_size,
+ out_size_factor=8,
+ dense_reg=1,
+ gaussian_overlap=0.1,
+ max_objs=500,
+ min_radius=2,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+ test_cfg=dict(
+ pts=dict(
+ post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_per_img=500,
+ max_pool_nms=False,
+ min_radius=[4, 12, 10, 1, 0.85, 0.175],
+ score_threshold=0.1,
+ out_size_factor=8,
+ voxel_size=voxel_size[:2],
+ nms_type='rotate',
+ pre_max_size=1000,
+ post_max_size=83,
+ nms_thr=0.2)))
diff --git a/adzoo/vad/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/adzoo/vad/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
new file mode 100644
index 0000000..311d763
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
@@ -0,0 +1,83 @@
+voxel_size = [0.2, 0.2, 8]
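+# PointPillars-style voxelization: the 8 m z-extent spans the full height
+# range, so each 0.2 m x 0.2 m column forms a single pillar.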
+model = dict(
+ type='CenterPoint',
+ pts_voxel_layer=dict(
+ max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)),
+ pts_voxel_encoder=dict(
+ type='PillarFeatureNet',
+ in_channels=5,
+ feat_channels=[64],
+ with_distance=False,
+ voxel_size=(0.2, 0.2, 8),
+ norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+ legacy=False),
+ pts_middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ out_channels=[64, 128, 256],
+ layer_nums=[3, 5, 5],
+ layer_strides=[2, 2, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ conv_cfg=dict(type='Conv2d', bias=False)),
+ pts_neck=dict(
+ type='SECONDFPN',
+ in_channels=[64, 128, 256],
+ out_channels=[128, 128, 128],
+ upsample_strides=[0.5, 1, 2],
+ norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+ upsample_cfg=dict(type='deconv', bias=False),
+ use_conv_for_no_stride=True),
+ pts_bbox_head=dict(
+ type='CenterHead',
+ in_channels=sum([128, 128, 128]),
+ tasks=[
+ dict(num_class=1, class_names=['car']),
+ dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+ dict(num_class=2, class_names=['bus', 'trailer']),
+ dict(num_class=1, class_names=['barrier']),
+ dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+ dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+ ],
+ common_heads=dict(
+ reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+ share_conv_channel=64,
+ bbox_coder=dict(
+ type='CenterPointBBoxCoder',
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_num=500,
+ score_threshold=0.1,
+ out_size_factor=4,
+ voxel_size=voxel_size[:2],
+ code_size=9),
+ separate_head=dict(
+ type='SeparateHead', init_bias=-2.19, final_kernel=3),
+ loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+ loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+ norm_bbox=True),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ grid_size=[512, 512, 1],
+ voxel_size=voxel_size,
+ out_size_factor=4,
+ dense_reg=1,
+ gaussian_overlap=0.1,
+ max_objs=500,
+ min_radius=2,
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+ test_cfg=dict(
+ pts=dict(
+ post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+ max_per_img=500,
+ max_pool_nms=False,
+ min_radius=[4, 12, 10, 1, 0.85, 0.175],
+ score_threshold=0.1,
+ pc_range=[-51.2, -51.2],
+ out_size_factor=4,
+ voxel_size=voxel_size[:2],
+ nms_type='rotate',
+ pre_max_size=1000,
+ post_max_size=83,
+ nms_thr=0.2)))
diff --git a/adzoo/vad/configs/_base_/models/fcos3d.py b/adzoo/vad/configs/_base_/models/fcos3d.py
new file mode 100644
index 0000000..92ea907
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/fcos3d.py
@@ -0,0 +1,74 @@
+model = dict(
+ type='FCOSMono3D',
+ pretrained='open-mmlab://detectron2/resnet101_caffe',
+ backbone=dict(
+ type='ResNet',
+ depth=101,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ start_level=1,
+ add_extra_convs='on_output',
+ num_outs=5,
+ relu_before_extra_convs=True),
+ bbox_head=dict(
+ type='FCOSMono3DHead',
+ num_classes=10,
+ in_channels=256,
+ stacked_convs=2,
+ feat_channels=256,
+ use_direction_classifier=True,
+ diff_rad_by_sin=True,
+ pred_attrs=True,
+ pred_velo=True,
+ dir_offset=0.7854, # pi/4
+ strides=[8, 16, 32, 64, 128],
+ group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo
+ cls_branch=(256, ),
+ reg_branch=(
+ (256, ), # offset
+ (256, ), # depth
+ (256, ), # size
+ (256, ), # rot
+ () # velo
+ ),
+ dir_branch=(256, ),
+ attr_branch=(256, ),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_attr=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_centerness=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ norm_on_bbox=True,
+ centerness_on_reg=True,
+ center_sampling=True,
+ conv_bias=True,
+ dcn_on_last_conv=True),
+ train_cfg=dict(
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_thr=0.8,
+ score_thr=0.05,
+ min_bbox_size=0,
+ max_per_img=200))
diff --git a/adzoo/vad/configs/_base_/models/groupfree3d.py b/adzoo/vad/configs/_base_/models/groupfree3d.py
new file mode 100644
index 0000000..077d049
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/groupfree3d.py
@@ -0,0 +1,71 @@
+model = dict(
+ type='GroupFree3DNet',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=3,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 288)),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True)),
+ bbox_head=dict(
+ type='GroupFree3DHead',
+ in_channels=288,
+ num_decoder_layers=6,
+ num_proposal=256,
+ transformerlayers=dict(
+ type='BaseTransformerLayer',
+ attn_cfgs=dict(
+ type='GroupFree3DMHA',
+ embed_dims=288,
+ num_heads=8,
+ attn_drop=0.1,
+ dropout_layer=dict(type='Dropout', drop_prob=0.1)),
+ ffn_cfgs=dict(
+ embed_dims=288,
+ feedforward_channels=2048,
+ ffn_drop=0.1,
+ act_cfg=dict(type='ReLU', inplace=True)),
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
+ 'norm')),
+ pred_layer_cfg=dict(
+ in_channels=288, shared_conv_channels=(288, 288), bias=True),
+ sampling_objectness_loss=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=8.0),
+ objectness_loss=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ center_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(sample_mod='kps'),
+ test_cfg=dict(
+ sample_mod='kps',
+ nms_thr=0.25,
+ score_thr=0.0,
+ per_class_proposal=True,
+ prediction_stages='last'))
diff --git a/adzoo/vad/configs/_base_/models/h3dnet.py b/adzoo/vad/configs/_base_/models/h3dnet.py
new file mode 100644
index 0000000..7605667
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/h3dnet.py
@@ -0,0 +1,341 @@
+primitive_z_cfg = dict(
+ type='PrimitiveHead',
+ num_dims=2,
+ num_classes=18,
+ primitive_mode='z',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=1024,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_reg_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ train_cfg=dict(
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+primitive_xy_cfg = dict(
+ type='PrimitiveHead',
+ num_dims=1,
+ num_classes=18,
+ primitive_mode='xy',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=1024,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_reg_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ train_cfg=dict(
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+primitive_line_cfg = dict(
+ type='PrimitiveHead',
+ num_dims=0,
+ num_classes=18,
+ primitive_mode='line',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=1024,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=1.0,
+ loss_dst_weight=1.0),
+ semantic_reg_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=1.0,
+ loss_dst_weight=1.0),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=2.0),
+ train_cfg=dict(
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+model = dict(
+ type='H3DNet',
+ backbone=dict(
+ type='MultiBackbone',
+ num_streams=4,
+ suffixes=['net0', 'net1', 'net2', 'net3'],
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
+ act_cfg=dict(type='ReLU'),
+ backbones=dict(
+ type='PointNet2SASSG',
+ in_channels=4,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 256)),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True))),
+ rpn_head=dict(
+ type='VoteHead',
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=3,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=256,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ pred_layer_cfg=dict(
+ in_channels=128, shared_conv_channels=(128, 128), bias=True),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ roi_head=dict(
+ type='H3DRoIHead',
+ primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg],
+ bbox_head=dict(
+ type='H3DBboxHead',
+ gt_per_seed=3,
+ num_proposal=256,
+ suface_matching_cfg=dict(
+ type='PointSAModule',
+ num_point=256 * 6,
+ radius=0.5,
+ num_sample=32,
+ mlp_channels=[128 + 6, 128, 64, 32],
+ use_xyz=True,
+ normalize_xyz=True),
+ line_matching_cfg=dict(
+ type='PointSAModule',
+ num_point=256 * 12,
+ radius=0.5,
+ num_sample=32,
+ mlp_channels=[128 + 12, 128, 64, 32],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ primitive_refine_channels=[128, 128, 128],
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ line_thresh=0.5,
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ cues_objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.3, 0.7],
+ reduction='mean',
+ loss_weight=5.0),
+ cues_semantic_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.3, 0.7],
+ reduction='mean',
+ loss_weight=5.0),
+ proposal_objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='none',
+ loss_weight=5.0),
+ primitive_center_loss=dict(
+ type='MSELoss', reduction='none', loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
+ rpn_proposal=dict(use_nms=False),
+ rcnn=dict(
+ pos_distance_thr=0.3,
+ neg_distance_thr=0.6,
+ sample_mod='vote',
+ far_threshold=0.6,
+ near_threshold=0.3,
+ mask_surface_threshold=0.3,
+ label_surface_threshold=0.3,
+ mask_line_threshold=0.3,
+ label_line_threshold=0.3)),
+ test_cfg=dict(
+ rpn=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True,
+ use_nms=False),
+ rcnn=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True)))
diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_lyft.py
new file mode 100644
index 0000000..87c7fe0
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_lyft.py
@@ -0,0 +1,22 @@
+_base_ = './hv_pointpillars_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
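+# Here the [-80, 80] m range, together with the 0.25 m pillar size inherited
+# from the nuScenes base config, gives the 640 x 640 output_shape below
+# (160 / 0.25 = 640).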
+model = dict(
+ pts_voxel_layer=dict(
+ max_num_points=20,
+ point_cloud_range=[-80, -80, -5, 80, 80, 3],
+ max_voxels=(60000, 60000)),
+ pts_voxel_encoder=dict(
+ feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]),
+ pts_middle_encoder=dict(output_shape=[640, 640]),
+ pts_bbox_head=dict(
+ num_classes=9,
+ anchor_generator=dict(
+ ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]),
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+ # model training settings (based on nuScenes model settings)
+ train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_nus.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_nus.py
new file mode 100644
index 0000000..e153f6c
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_nus.py
@@ -0,0 +1,96 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.25, 0.25, 8]
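+# 0.25 m pillars over the [-50, 50] m range give the 400 x 400 scatter
+# output_shape below (100 / 0.25 = 400).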
+model = dict(
+ type='MVXFasterRCNN',
+ pts_voxel_layer=dict(
+ max_num_points=64,
+ point_cloud_range=[-50, -50, -5, 50, 50, 3],
+ voxel_size=voxel_size,
+ max_voxels=(30000, 40000)),
+ pts_voxel_encoder=dict(
+ type='HardVFE',
+ in_channels=4,
+ feat_channels=[64, 64],
+ with_distance=False,
+ voxel_size=voxel_size,
+ with_cluster_center=True,
+ with_voxel_center=True,
+ point_cloud_range=[-50, -50, -5, 50, 50, 3],
+ norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+ pts_middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ layer_nums=[3, 5, 5],
+ layer_strides=[2, 2, 2],
+ out_channels=[64, 128, 256]),
+ pts_neck=dict(
+ type='FPN',
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ act_cfg=dict(type='ReLU'),
+ in_channels=[64, 128, 256],
+ out_channels=256,
+ start_level=0,
+ num_outs=3),
+ pts_bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=10,
+ in_channels=256,
+ feat_channels=256,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='AlignedAnchor3DRangeGenerator',
+ ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+ scales=[1, 2, 4],
+ sizes=[
+ [0.8660, 2.5981, 1.], # 1.5/sqrt(3), 1.5*sqrt(3), 1
+ [0.5774, 1.7321, 1.], # 1/sqrt(3), sqrt(3), 1
+ [1., 1., 1.],
+ [0.4, 0.4, 1],
+ ],
+ custom_values=[0, 0],
+ rotations=[0, 1.57],
+ reshape_out=True),
+ assigner_per_size=False,
+ diff_rad_by_sin=True,
+ dir_offset=0.7854, # pi/4
+ dir_limit_offset=0,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ pts=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_thr=0.2,
+ score_thr=0.05,
+ min_bbox_size=0,
+ max_num=500)))
diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
new file mode 100644
index 0000000..9cd200f
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
@@ -0,0 +1,22 @@
+_base_ = './hv_pointpillars_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+ pts_voxel_layer=dict(
+ max_num_points=20,
+ point_cloud_range=[-100, -100, -5, 100, 100, 3],
+ max_voxels=(60000, 60000)),
+ pts_voxel_encoder=dict(
+ feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]),
+ pts_middle_encoder=dict(output_shape=[800, 800]),
+ pts_bbox_head=dict(
+ num_classes=9,
+ anchor_generator=dict(
+ ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]),
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+ # model training settings (based on nuScenes model settings)
+ train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_kitti.py
new file mode 100644
index 0000000..85076d0
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_kitti.py
@@ -0,0 +1,93 @@
+voxel_size = [0.16, 0.16, 4]
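+# The KITTI range below spans 69.12 m in x and 79.36 m in y, so 0.16 m pillars
+# give a 432 x 496 grid, i.e. the output_shape [496, 432] (y, x) used by the
+# scatter module.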
+
+model = dict(
+ type='VoxelNet',
+ voxel_layer=dict(
+ max_num_points=32, # max_points_per_voxel
+ point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
+ voxel_size=voxel_size,
+ max_voxels=(16000, 40000) # (training, testing) max_voxels
+ ),
+ voxel_encoder=dict(
+ type='PillarFeatureNet',
+ in_channels=4,
+ feat_channels=[64],
+ with_distance=False,
+ voxel_size=voxel_size,
+ point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
+ middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+ backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ layer_nums=[3, 5, 5],
+ layer_strides=[2, 2, 2],
+ out_channels=[64, 128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ in_channels=[64, 128, 256],
+ upsample_strides=[1, 2, 4],
+ out_channels=[128, 128, 128]),
+ bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=384,
+ feat_channels=384,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='Anchor3DRangeGenerator',
+ ranges=[
+ [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+ [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+ [0, -39.68, -1.78, 70.4, 39.68, -1.78],
+ ],
+ sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.45,
+ min_pos_iou=0.45,
+ ignore_iof_thr=-1),
+ ],
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_thr=0.01,
+ score_thr=0.1,
+ min_bbox_size=0,
+ nms_pre=100,
+ max_num=50))
diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_waymo.py
new file mode 100644
index 0000000..14873ea
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_waymo.py
@@ -0,0 +1,108 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.32, 0.32, 6]
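+# The [-74.88, 74.88] m range spans 149.76 m, so 0.32 m pillars give the
+# 468 x 468 output_shape below (149.76 / 0.32 = 468).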
+model = dict(
+ type='MVXFasterRCNN',
+ pts_voxel_layer=dict(
+ max_num_points=20,
+ point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+ voxel_size=voxel_size,
+ max_voxels=(32000, 32000)),
+ pts_voxel_encoder=dict(
+ type='HardVFE',
+ in_channels=5,
+ feat_channels=[64],
+ with_distance=False,
+ voxel_size=voxel_size,
+ with_cluster_center=True,
+ with_voxel_center=True,
+ point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+ norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+ pts_middle_encoder=dict(
+ type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]),
+ pts_backbone=dict(
+ type='SECOND',
+ in_channels=64,
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ layer_nums=[3, 5, 5],
+ layer_strides=[1, 2, 2],
+ out_channels=[64, 128, 256]),
+ pts_neck=dict(
+ type='SECONDFPN',
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ in_channels=[64, 128, 256],
+ upsample_strides=[1, 2, 4],
+ out_channels=[128, 128, 128]),
+ pts_bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=384,
+ feat_channels=384,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='AlignedAnchor3DRangeGenerator',
+ ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
+ [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188],
+ [-74.88, -74.88, 0, 74.88, 74.88, 0]],
+ sizes=[
+ [2.08, 4.73, 1.77], # car
+ [0.84, 1.81, 1.77], # cyclist
+ [0.84, 0.91, 1.74] # pedestrian
+ ],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ dir_offset=0.7854, # pi/4
+ dir_limit_offset=0,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ pts=dict(
+ assigner=[
+ dict( # car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.4,
+ min_pos_iou=0.4,
+ ignore_iof_thr=-1),
+ dict( # cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ dict( # pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ ],
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ pts=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=4096,
+ nms_thr=0.25,
+ score_thr=0.1,
+ min_bbox_size=0,
+ max_num=500)))
diff --git a/adzoo/vad/configs/_base_/models/hv_second_secfpn_kitti.py b/adzoo/vad/configs/_base_/models/hv_second_secfpn_kitti.py
new file mode 100644
index 0000000..6bf18ab
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_second_secfpn_kitti.py
@@ -0,0 +1,89 @@
+voxel_size = [0.05, 0.05, 0.1]
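+# 0.05 m x 0.05 m x 0.1 m voxels over the [0, -40, -3, 70.4, 40, 1] range give
+# the sparse_shape [41, 1600, 1408] below: 41 = 4 / 0.1 (+1 padding in z),
+# 1600 = 80 / 0.05, 1408 = 70.4 / 0.05.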
+
+model = dict(
+ type='VoxelNet',
+ voxel_layer=dict(
+ max_num_points=5,
+ point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+ voxel_size=voxel_size,
+ max_voxels=(16000, 40000)),
+ voxel_encoder=dict(type='HardSimpleVFE'),
+ middle_encoder=dict(
+ type='SparseEncoder',
+ in_channels=4,
+ sparse_shape=[41, 1600, 1408],
+ order=('conv', 'norm', 'act')),
+ backbone=dict(
+ type='SECOND',
+ in_channels=256,
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ out_channels=[128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ in_channels=[128, 256],
+ upsample_strides=[1, 2],
+ out_channels=[256, 256]),
+ bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=512,
+ feat_channels=512,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='Anchor3DRangeGenerator',
+ ranges=[
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+ ],
+ sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.35,
+ neg_iou_thr=0.2,
+ min_pos_iou=0.2,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.35,
+ neg_iou_thr=0.2,
+ min_pos_iou=0.2,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.45,
+ min_pos_iou=0.45,
+ ignore_iof_thr=-1),
+ ],
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_thr=0.01,
+ score_thr=0.1,
+ min_bbox_size=0,
+ nms_pre=100,
+ max_num=50))
diff --git a/adzoo/vad/configs/_base_/models/hv_second_secfpn_waymo.py b/adzoo/vad/configs/_base_/models/hv_second_secfpn_waymo.py
new file mode 100644
index 0000000..eb9bd3a
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/hv_second_secfpn_waymo.py
@@ -0,0 +1,100 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.08, 0.08, 0.1]
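+# 0.08 m x 0.08 m x 0.1 m voxels over the range below give the sparse_shape
+# [61, 1280, 1920]: 61 = 6 / 0.1 (+1 padding in z), 1280 = 102.4 / 0.08,
+# 1920 = 153.6 / 0.08.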
+model = dict(
+ type='VoxelNet',
+ voxel_layer=dict(
+ max_num_points=10,
+ point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4],
+ voxel_size=voxel_size,
+ max_voxels=(80000, 90000)),
+ voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+ middle_encoder=dict(
+ type='SparseEncoder',
+ in_channels=5,
+ sparse_shape=[61, 1280, 1920],
+ order=('conv', 'norm', 'act')),
+ backbone=dict(
+ type='SECOND',
+ in_channels=384,
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ out_channels=[128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+ in_channels=[128, 256],
+ upsample_strides=[1, 2],
+ out_channels=[256, 256]),
+ bbox_head=dict(
+ type='Anchor3DHead',
+ num_classes=3,
+ in_channels=512,
+ feat_channels=512,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='AlignedAnchor3DRangeGenerator',
+ ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345],
+ [-76.8, -51.2, 0, 76.8, 51.2, 0],
+ [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]],
+ sizes=[
+ [2.08, 4.73, 1.77], # car
+ [0.84, 0.91, 1.74], # pedestrian
+ [0.84, 1.81, 1.77] # cyclist
+ ],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ dir_offset=0.7854, # pi/4
+ dir_limit_offset=0,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ # model training and testing settings
+ train_cfg=dict(
+ assigner=[
+ dict( # car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.4,
+ min_pos_iou=0.4,
+ ignore_iof_thr=-1),
+ dict( # pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1),
+ dict( # cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ ignore_iof_thr=-1)
+ ],
+ allowed_border=0,
+ code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+ pos_weight=-1,
+ debug=False),
+ test_cfg=dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=4096,
+ nms_thr=0.25,
+ score_thr=0.1,
+ min_bbox_size=0,
+ max_num=500))
diff --git a/adzoo/vad/configs/_base_/models/imvotenet_image.py b/adzoo/vad/configs/_base_/models/imvotenet_image.py
new file mode 100644
index 0000000..981f8bc
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/imvotenet_image.py
@@ -0,0 +1,108 @@
+model = dict(
+ type='ImVoteNet',
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ img_rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ img_roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=10,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+
+ # model training and testing settings
+ train_cfg=dict(
+ img_rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ img_rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,
+ nms_post=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ img_rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ img_rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ img_rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
diff --git a/adzoo/vad/configs/_base_/models/mask_rcnn_r50_fpn.py b/adzoo/vad/configs/_base_/models/mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000..c5d5e32
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/mask_rcnn_r50_fpn.py
@@ -0,0 +1,124 @@
+# model settings
+model = dict(
+ type='MaskRCNN',
+ pretrained='torchvision://resnet50',
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ style='pytorch'),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=80,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ mask_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ mask_head=dict(
+ type='FCNMaskHead',
+ num_convs=4,
+ in_channels=256,
+ conv_out_channels=256,
+ num_classes=80,
+ loss_mask=dict(
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ mask_size=28,
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100,
+ mask_thr_binary=0.5)))
diff --git a/adzoo/vad/configs/_base_/models/paconv_cuda_ssg.py b/adzoo/vad/configs/_base_/models/paconv_cuda_ssg.py
new file mode 100644
index 0000000..f513bd4
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/paconv_cuda_ssg.py
@@ -0,0 +1,7 @@
+_base_ = './paconv_ssg.py'
+
+model = dict(
+ backbone=dict(
+ sa_cfg=dict(
+ type='PAConvCUDASAModule',
+ scorenet_cfg=dict(mlp_channels=[8, 16, 16]))))
diff --git a/adzoo/vad/configs/_base_/models/paconv_ssg.py b/adzoo/vad/configs/_base_/models/paconv_ssg.py
new file mode 100644
index 0000000..1d4f1ed
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/paconv_ssg.py
@@ -0,0 +1,49 @@
+# model settings
+model = dict(
+ type='EncoderDecoder3D',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=9, # [xyz, rgb, normalized_xyz]
+ num_points=(1024, 256, 64, 16),
+ radius=(None, None, None, None), # use kNN instead of ball query
+ num_samples=(32, 32, 32, 32),
+ sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+ 512)),
+ fp_channels=(),
+ norm_cfg=dict(type='BN2d', momentum=0.1),
+ sa_cfg=dict(
+ type='PAConvSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False,
+ paconv_num_kernels=[16, 16, 16],
+ paconv_kernel_input='w_neighbor',
+ scorenet_input='w_neighbor_dist',
+ scorenet_cfg=dict(
+ mlp_channels=[16, 16, 16],
+ score_norm='softmax',
+ temp_factor=1.0,
+ last_bn=False))),
+ decode_head=dict(
+ type='PAConvHead',
+ # The PAConv decoder takes skip connections from the backbone.
+ # Unlike PointNet++, it also concatenates the input features at the last
+ # decoder level, leading to `128 + 6` channels there.
+ fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+ (128 + 6, 128, 128, 128)),
+ channels=128,
+ dropout_ratio=0.5,
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ act_cfg=dict(type='ReLU'),
+ loss_decode=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ class_weight=None, # set according to the dataset
+ loss_weight=1.0)),
+ # correlation loss to regularize PAConv's kernel weights
+ loss_regularization=dict(
+ type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide'))
diff --git a/adzoo/vad/configs/_base_/models/parta2.py b/adzoo/vad/configs/_base_/models/parta2.py
new file mode 100644
index 0000000..6c5ae9a
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/parta2.py
@@ -0,0 +1,201 @@
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+ type='PartA2',
+ voxel_layer=dict(
+ max_num_points=5, # max_points_per_voxel
+ point_cloud_range=point_cloud_range,
+ voxel_size=voxel_size,
+ max_voxels=(16000, 40000) # (training, testing) max_voxels
+ ),
+ voxel_encoder=dict(type='HardSimpleVFE'),
+ middle_encoder=dict(
+ type='SparseUNet',
+ in_channels=4,
+ sparse_shape=[41, 1600, 1408],
+ order=('conv', 'norm', 'act')),
+ backbone=dict(
+ type='SECOND',
+ in_channels=256,
+ layer_nums=[5, 5],
+ layer_strides=[1, 2],
+ out_channels=[128, 256]),
+ neck=dict(
+ type='SECONDFPN',
+ in_channels=[128, 256],
+ upsample_strides=[1, 2],
+ out_channels=[256, 256]),
+ rpn_head=dict(
+ type='PartA2RPNHead',
+ num_classes=3,
+ in_channels=512,
+ feat_channels=512,
+ use_direction_classifier=True,
+ anchor_generator=dict(
+ type='Anchor3DRangeGenerator',
+ ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+ [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+ sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+ rotations=[0, 1.57],
+ reshape_out=False),
+ diff_rad_by_sin=True,
+ assigner_per_size=True,
+ assign_per_class=True,
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ loss_cls=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+ loss_dir=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+ roi_head=dict(
+ type='PartAggregationROIHead',
+ num_classes=3,
+ semantic_head=dict(
+ type='PointwiseSemanticHead',
+ in_channels=16,
+ extra_width=0.2,
+ seg_score_thr=0.3,
+ num_classes=3,
+ loss_seg=dict(
+ type='FocalLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ gamma=2.0,
+ alpha=0.25,
+ loss_weight=1.0),
+ loss_part=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+ seg_roi_extractor=dict(
+ type='Single3DRoIAwareExtractor',
+ roi_layer=dict(
+ type='RoIAwarePool3d',
+ out_size=14,
+ max_pts_per_voxel=128,
+ mode='max')),
+ part_roi_extractor=dict(
+ type='Single3DRoIAwareExtractor',
+ roi_layer=dict(
+ type='RoIAwarePool3d',
+ out_size=14,
+ max_pts_per_voxel=128,
+ mode='avg')),
+ bbox_head=dict(
+ type='PartA2BboxHead',
+ num_classes=3,
+ seg_in_channels=16,
+ part_in_channels=4,
+ seg_conv_channels=[64, 64],
+ part_conv_channels=[64, 64],
+ merge_conv_channels=[128, 128],
+ down_conv_channels=[128, 256],
+ bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+ shared_fc_channels=[256, 512, 512, 512],
+ cls_channels=[256, 256],
+ reg_channels=[256, 256],
+ dropout_ratio=0.1,
+ roi_feat_size=14,
+ with_corner_loss=True,
+ loss_bbox=dict(
+ type='SmoothL1Loss',
+ beta=1.0 / 9.0,
+ reduction='sum',
+ loss_weight=1.0),
+ loss_cls=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=True,
+ reduction='sum',
+ loss_weight=1.0))),
+ # model training and testing settings
+ train_cfg=dict(
+ rpn=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.35,
+ min_pos_iou=0.35,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(type='BboxOverlapsNearest3D'),
+ pos_iou_thr=0.6,
+ neg_iou_thr=0.45,
+ min_pos_iou=0.45,
+ ignore_iof_thr=-1)
+ ],
+ allowed_border=0,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=9000,
+ nms_post=512,
+ max_num=512,
+ nms_thr=0.8,
+ score_thr=0,
+ use_rotate_nms=False),
+ rcnn=dict(
+ assigner=[
+ dict( # for Pedestrian
+ type='MaxIoUAssigner',
+ iou_calculator=dict(
+ type='BboxOverlaps3D', coordinate='lidar'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.55,
+ min_pos_iou=0.55,
+ ignore_iof_thr=-1),
+ dict( # for Cyclist
+ type='MaxIoUAssigner',
+ iou_calculator=dict(
+ type='BboxOverlaps3D', coordinate='lidar'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.55,
+ min_pos_iou=0.55,
+ ignore_iof_thr=-1),
+ dict( # for Car
+ type='MaxIoUAssigner',
+ iou_calculator=dict(
+ type='BboxOverlaps3D', coordinate='lidar'),
+ pos_iou_thr=0.55,
+ neg_iou_thr=0.55,
+ min_pos_iou=0.55,
+ ignore_iof_thr=-1)
+ ],
+ sampler=dict(
+ type='IoUNegPiecewiseSampler',
+ num=128,
+ pos_fraction=0.55,
+ neg_piece_fractions=[0.8, 0.2],
+ neg_iou_piece_thrs=[0.55, 0.1],
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False,
+ return_iou=True),
+ cls_pos_thr=0.75,
+ cls_neg_thr=0.25)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1024,
+ nms_post=100,
+ max_num=100,
+ nms_thr=0.7,
+ score_thr=0,
+ use_rotate_nms=True),
+ rcnn=dict(
+ use_rotate_nms=True,
+ use_raw_score=True,
+ nms_thr=0.01,
+ score_thr=0.1)))
diff --git a/adzoo/vad/configs/_base_/models/pointnet2_msg.py b/adzoo/vad/configs/_base_/models/pointnet2_msg.py
new file mode 100644
index 0000000..222ab88
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/pointnet2_msg.py
@@ -0,0 +1,28 @@
+_base_ = './pointnet2_ssg.py'
+
+# model settings
+model = dict(
+ backbone=dict(
+ _delete_=True,
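+ # _delete_=True replaces the backbone inherited from pointnet2_ssg.py
+ # entirely instead of merging these keys into it.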
+ type='PointNet2SAMSG',
+ in_channels=6, # [xyz, rgb]; modify to match the dataset
+ num_points=(1024, 256, 64, 16),
+ radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)),
+ num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
+ sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
+ 128)),
+ ((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
+ (256, 384, 512))),
+ aggregation_channels=(None, None, None, None),
+ fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
+ fps_sample_range_lists=((-1), (-1), (-1), (-1)),
+ dilated_group=(False, False, False, False),
+ out_indices=(0, 1, 2, 3),
+ sa_cfg=dict(
+ type='PointSAModuleMSG',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False)),
+ decode_head=dict(
+ fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128),
+ (128, 128, 128, 128))))
diff --git a/adzoo/vad/configs/_base_/models/pointnet2_ssg.py b/adzoo/vad/configs/_base_/models/pointnet2_ssg.py
new file mode 100644
index 0000000..58b4c24
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/pointnet2_ssg.py
@@ -0,0 +1,35 @@
+# model settings
+model = dict(
+ type='EncoderDecoder3D',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=6, # [xyz, rgb]; modify to match the dataset
+ num_points=(1024, 256, 64, 16),
+ radius=(0.1, 0.2, 0.4, 0.8),
+ num_samples=(32, 32, 32, 32),
+ sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+ 512)),
+ fp_channels=(),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=False)),
+ decode_head=dict(
+ type='PointNet2Head',
+ fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+ (128, 128, 128, 128)),
+ channels=128,
+ dropout_ratio=0.5,
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ act_cfg=dict(type='ReLU'),
+ loss_decode=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ class_weight=None, # set according to the dataset
+ loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide'))
diff --git a/adzoo/vad/configs/_base_/models/votenet.py b/adzoo/vad/configs/_base_/models/votenet.py
new file mode 100644
index 0000000..129339d
--- /dev/null
+++ b/adzoo/vad/configs/_base_/models/votenet.py
@@ -0,0 +1,73 @@
+model = dict(
+ type='VoteNet',
+ backbone=dict(
+ type='PointNet2SASSG',
+ in_channels=4,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 256)),
+ norm_cfg=dict(type='BN2d'),
+ sa_cfg=dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True)),
+ bbox_head=dict(
+ type='VoteHead',
+ vote_module_cfg=dict(
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=3,
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ type='PointSAModule',
+ num_point=256,
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ pred_layer_cfg=dict(
+ in_channels=128, shared_conv_channels=(128, 128), bias=True),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(
+ pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
+ test_cfg=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True))
diff --git a/adzoo/vad/configs/_base_/schedules/cosine.py b/adzoo/vad/configs/_base_/schedules/cosine.py
new file mode 100644
index 0000000..69cb7df
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/cosine.py
@@ -0,0 +1,20 @@
+# This schedule is mainly used by models with dynamic voxelization
+# optimizer
+lr = 0.003 # max learning rate
+optimizer = dict(
+ type='AdamW',
+ lr=lr,
+ betas=(0.95, 0.99), # the momentum is changed during training
+ weight_decay=0.001)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+
+lr_config = dict(
+ policy='CosineAnnealing',
+ warmup='linear',
+ warmup_iters=1000,
+ warmup_ratio=1.0 / 10,
+ min_lr_ratio=1e-5)
+
+momentum_config = None
+
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/adzoo/vad/configs/_base_/schedules/cyclic_20e.py b/adzoo/vad/configs/_base_/schedules/cyclic_20e.py
new file mode 100644
index 0000000..704740e
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/cyclic_20e.py
@@ -0,0 +1,24 @@
+# For the nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the evaluation
+# interval to 20. Please change the interval accordingly if you do not use the
+# default schedule.
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+ policy='cyclic',
+ target_ratio=(10, 1e-4),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+momentum_config = dict(
+ policy='cyclic',
+ target_ratio=(0.85 / 0.95, 1),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/adzoo/vad/configs/_base_/schedules/cyclic_40e.py b/adzoo/vad/configs/_base_/schedules/cyclic_40e.py
new file mode 100644
index 0000000..4a711ac
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/cyclic_40e.py
@@ -0,0 +1,31 @@
+# This schedule is usually used by models trained on the KITTI dataset.
+
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since target_ratio is (10, 1e-4), the
+# learning rate rises from 0.0018 to 0.018 (0.0018 * 10) and then decays
+# towards 0.0018 * 1e-4 = 1.8e-7.
+lr = 0.0018
+# The optimizer follows the settings in SECOND.Pytorch, but here we use
+# the official AdamW optimizer implemented by PyTorch.
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
+# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa
+# We implement them in mmcv, for more details, please refer to
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa
+lr_config = dict(
+ policy='cyclic',
+ target_ratio=(10, 1e-4),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+momentum_config = dict(
+ policy='cyclic',
+ target_ratio=(0.85 / 0.95, 1),
+ cyclic_times=1,
+ step_ratio_up=0.4,
+)
+# Although max_epochs is 40, this schedule is usually used with
+# RepeatDataset with repeat ratio N, so the actual maximum number of
+# epochs can be N x 40.
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/adzoo/vad/configs/_base_/schedules/mmdet_schedule_1x.py b/adzoo/vad/configs/_base_/schedules/mmdet_schedule_1x.py
new file mode 100644
index 0000000..13b3783
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/mmdet_schedule_1x.py
@@ -0,0 +1,11 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/adzoo/vad/configs/_base_/schedules/schedule_2x.py b/adzoo/vad/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000..afde799
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,14 @@
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=1000,
+ warmup_ratio=1.0 / 1000,
+ step=[20, 23])
+momentum_config = None
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/adzoo/vad/configs/_base_/schedules/schedule_3x.py b/adzoo/vad/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000..115cd26
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used by models on indoor dataset,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008 # max learning rate
+optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+lr_config = dict(policy='step', warmup=None, step=[24, 32])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/adzoo/vad/configs/_base_/schedules/seg_cosine_150e.py b/adzoo/vad/configs/_base_/schedules/seg_cosine_150e.py
new file mode 100644
index 0000000..04b44e5
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/seg_cosine_150e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=150)
diff --git a/adzoo/vad/configs/_base_/schedules/seg_cosine_200e.py b/adzoo/vad/configs/_base_/schedules/seg_cosine_200e.py
new file mode 100644
index 0000000..6a49484
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/seg_cosine_200e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on ScanNet dataset in segmentation task
+optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=200)
diff --git a/adzoo/vad/configs/_base_/schedules/seg_cosine_50e.py b/adzoo/vad/configs/_base_/schedules/seg_cosine_50e.py
new file mode 100644
index 0000000..975a8f9
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/seg_cosine_50e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=50)
diff --git a/adzoo/vad/configs/datasets/custom_lyft-3d.py b/adzoo/vad/configs/datasets/custom_lyft-3d.py
new file mode 100644
index 0000000..5a95d89
--- /dev/null
+++ b/adzoo/vad/configs/datasets/custom_lyft-3d.py
@@ -0,0 +1,136 @@
+# If the point cloud range is changed, the model's point cloud range should
+# be changed accordingly.
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+ 'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'CustomLyftDataset'
+data_root = 'data/lyft/'
+# Input modality for the Lyft dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=True)
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/lyft/': 's3://lyft/lyft/',
+# 'data/lyft/': 's3://lyft/lyft/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# Construct a pipeline for data and GT loading in the show function.
+# Please keep its loading consistent with test_pipeline (e.g. the file client).
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'lyft_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True))
+# For Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the
+# evaluation interval to 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
\ No newline at end of file
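These dataset configs are plain Python files that are parsed into a nested dict at train/test time. A minimal sketch of inspecting one, assuming the bundled mmcv still exposes `Config.fromfile` (adjust the import if this repo re-exports it elsewhere):

```python
from mmcv import Config  # assumption: the merged mmcv build keeps this API

cfg = Config.fromfile('adzoo/vad/configs/datasets/custom_lyft-3d.py')
print(cfg.dataset_type)                          # 'CustomLyftDataset'
print(cfg.point_cloud_range)                     # [-80, -80, -5, 80, 80, 3]
print([t['type'] for t in cfg.train_pipeline])   # ordered transform names
print(cfg.data.train.ann_file)                   # 'data/lyft/lyft_infos_train.pkl'
```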
diff --git a/adzoo/vad/configs/datasets/custom_nus-3d.py b/adzoo/vad/configs/datasets/custom_nus-3d.py
new file mode 100644
index 0000000..af81f9b
--- /dev/null
+++ b/adzoo/vad/configs/datasets/custom_nus-3d.py
@@ -0,0 +1,141 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset_eval_modified'
+data_root = 'data/nuscenes/'
+# Input modality for the nuScenes dataset; this is consistent with the submission
+# format, which requires the information in input_modality.
+input_modality = dict(
+ use_lidar=True,
+ use_camera=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel',
+# path_mapping=dict({
+# './data/nuscenes/': 's3://nuscenes/nuscenes/',
+# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
+# }))
+train_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[-0.3925, 0.3925],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='PointShuffle'),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1333, 800),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='GlobalRotScaleTrans',
+ rot_range=[0, 0],
+ scale_ratio_range=[1., 1.],
+ translation_std=[0, 0, 0]),
+ dict(type='RandomFlip3D'),
+ dict(
+ type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ])
+]
+# Construct a pipeline for data and GT loading in the show function.
+# Please keep its loading consistent with test_pipeline (e.g. the file client).
+eval_pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=file_client_args),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_train.pkl',
+ pipeline=train_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR'),
+ val=dict(
+ type=dataset_type,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'nuscenes_infos_val.pkl',
+ pipeline=test_pipeline,
+ classes=class_names,
+ modality=input_modality,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set the
+# evaluation interval to 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
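The `PointsRangeFilter` and `ObjectRangeFilter` steps above keep only the points and boxes that fall inside `point_cloud_range`. A minimal NumPy sketch of the point-side filtering (illustrative only; the real transforms also handle box geometry and attached attributes):

```python
import numpy as np

point_cloud_range = [-50, -50, -5, 50, 50, 3]  # same range as the config above

def filter_points(points, pc_range):
    """Keep points whose (x, y, z) lie inside the axis-aligned range."""
    low, high = np.array(pc_range[:3]), np.array(pc_range[3:])
    mask = np.all((points[:, :3] >= low) & (points[:, :3] <= high), axis=1)
    return points[mask]

pts = np.array([[10.0, -20.0, 0.5, 0.1, 0.0],    # kept
                [60.0,   0.0, 0.0, 0.2, 0.0]])   # dropped (x > 50)
print(filter_points(pts, point_cloud_range).shape)  # (1, 5)
```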
diff --git a/adzoo/vad/configs/datasets/custom_waymo-3d.py b/adzoo/vad/configs/datasets/custom_waymo-3d.py
new file mode 100644
index 0000000..4100e13
--- /dev/null
+++ b/adzoo/vad/configs/datasets/custom_waymo-3d.py
@@ -0,0 +1,112 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'CustomWaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if using Ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+img_norm_cfg = dict(
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=False, use_camera=True)
+db_sampler = dict(
+ data_root=data_root,
+ info_path=data_root + 'waymo_dbinfos_train.pkl',
+ rate=1.0,
+ prepare=dict(
+ filter_by_difficulty=[-1],
+ filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+ classes=class_names,
+ sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+ points_loader=dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=[0, 1, 2, 3, 4],
+ file_client_args=file_client_args))
+
+
+
+train_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='PhotoMetricDistortionMultiViewImage'),
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+ dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+ dict(type='ObjectNameFilter', classes=class_names),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(type='DefaultFormatBundle3D', class_names=class_names),
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+
+test_pipeline = [
+ dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+ dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+ dict(type='PadMultiViewImage', size_divisor=32),
+ dict(
+ type='MultiScaleFlipAug3D',
+ img_scale=(1920, 1280),
+ pts_scale_ratio=1,
+ flip=False,
+ transforms=[
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=class_names,
+ with_label=False),
+ dict(type='CustomCollect3D', keys=['img'])
+ ])
+]
+
+
+# Construct a pipeline for data and GT loading in the show function.
+# Please keep its loading consistent with test_pipeline (e.g. the file client).
+
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=2,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_train.pkl',
+ split='training',
+ pipeline=train_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=False,
+ # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+ # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+ box_type_3d='LiDAR',
+ # load one frame every five frames
+ load_interval=5)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ ann_file=data_root + 'waymo_infos_val.pkl',
+ split='training',
+ pipeline=test_pipeline,
+ modality=input_modality,
+ classes=class_names,
+ test_mode=True,
+ box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=test_pipeline)
\ No newline at end of file
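`NormalizeMultiviewImage` with the `img_norm_cfg` above subtracts a per-channel mean and divides by the per-channel std; `to_rgb=False` means the images stay in BGR order. A minimal sketch of the expected arithmetic, assuming standard mmcv-style image normalization:

```python
import numpy as np

mean = np.array([103.530, 116.280, 123.675], dtype=np.float32)  # BGR means
std = np.array([1.0, 1.0, 1.0], dtype=np.float32)

def normalize_img(img):
    """(H, W, 3) BGR uint8 image -> zero-mean float image (std is 1.0 here)."""
    return (img.astype(np.float32) - mean) / std

dummy = np.full((4, 4, 3), 128, dtype=np.uint8)
print(normalize_img(dummy)[0, 0])  # approx. [24.47, 11.72, 4.325]
```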
diff --git a/adzoo/vad/create_data.py b/adzoo/vad/create_data.py
new file mode 100644
index 0000000..f2b0cc1
--- /dev/null
+++ b/adzoo/vad/create_data.py
@@ -0,0 +1,305 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+from data_converter.create_gt_database import create_groundtruth_database
+from data_converter import nuscenes_converter as nuscenes_converter
+from data_converter import lyft_converter as lyft_converter
+from data_converter import kitti_converter as kitti
+from data_converter import indoor_converter as indoor
+import argparse
+from os import path as osp
+import sys
+sys.path.append('.')
+
+
+def kitti_data_prep(root_path, info_prefix, version, out_dir):
+ """Prepare data related to Kitti dataset.
+
+ Related data consists of '.pkl' files recording basic infos,
+ 2D annotations and groundtruth database.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ out_dir (str): Output directory of the groundtruth database info.
+ """
+ kitti.create_kitti_info_file(root_path, info_prefix)
+ kitti.create_reduced_point_cloud(root_path, info_prefix)
+
+ info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
+ info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl')
+ info_trainval_path = osp.join(root_path,
+ f'{info_prefix}_infos_trainval.pkl')
+ info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
+ kitti.export_2d_annotation(root_path, info_train_path)
+ kitti.export_2d_annotation(root_path, info_val_path)
+ kitti.export_2d_annotation(root_path, info_trainval_path)
+ kitti.export_2d_annotation(root_path, info_test_path)
+
+ create_groundtruth_database(
+ 'KittiDataset',
+ root_path,
+ info_prefix,
+ f'{out_dir}/{info_prefix}_infos_train.pkl',
+ relative_path=False,
+ mask_anno_path='instances_train.json',
+ with_mask=(version == 'mask'))
+
+
+def nuscenes_data_prep(root_path,
+ can_bus_root_path,
+ info_prefix,
+ version,
+ dataset_name,
+ out_dir,
+ max_sweeps=10):
+ """Prepare data related to nuScenes dataset.
+
+ Related data consists of '.pkl' files recording basic infos,
+ 2D annotations and groundtruth database.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ dataset_name (str): The dataset class name.
+ out_dir (str): Output directory of the groundtruth database info.
+ max_sweeps (int): Number of input consecutive frames. Default: 10
+ """
+ nuscenes_converter.create_nuscenes_infos(
+ root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+ if version == 'v1.0-test':
+ info_test_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_test.pkl')
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_test_path, version=version)
+ else:
+ info_train_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_train.pkl')
+ info_val_path = osp.join(
+ out_dir, f'{info_prefix}_infos_temporal_val.pkl')
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_train_path, version=version)
+ nuscenes_converter.export_2d_annotation(
+ root_path, info_val_path, version=version)
+ # create_groundtruth_database(dataset_name, root_path, info_prefix,
+ # f'{out_dir}/{info_prefix}_infos_train.pkl')
+
+
+def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10):
+ """Prepare data related to Lyft dataset.
+
+ Related data consists of '.pkl' files recording basic infos.
+ Although the ground truth database and 2D annotations are not used in
+    Lyft, they can also be generated in the same way as for nuScenes.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ max_sweeps (int, optional): Number of input consecutive frames.
+ Defaults to 10.
+ """
+ lyft_converter.create_lyft_infos(
+ root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+
+def scannet_data_prep(root_path, info_prefix, out_dir, workers):
+ """Prepare the info file for scannet dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ """
+ indoor.create_indoor_info_file(
+ root_path, info_prefix, out_dir, workers=workers)
+
+
+def s3dis_data_prep(root_path, info_prefix, out_dir, workers):
+ """Prepare the info file for s3dis dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ """
+ indoor.create_indoor_info_file(
+ root_path, info_prefix, out_dir, workers=workers)
+
+
+def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers):
+ """Prepare the info file for sunrgbd dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ """
+ indoor.create_indoor_info_file(
+ root_path, info_prefix, out_dir, workers=workers)
+
+
+def waymo_data_prep(root_path,
+ info_prefix,
+ version,
+ out_dir,
+ workers,
+ max_sweeps=5):
+ """Prepare the info file for waymo dataset.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ out_dir (str): Output directory of the generated info file.
+ workers (int): Number of threads to be used.
+ max_sweeps (int): Number of input consecutive frames. Default: 5 \
+ Here we store pose information of these frames for later use.
+ """
+ from tools.data_converter import waymo_converter as waymo
+
+ splits = ['training', 'validation', 'testing']
+
+ for i, split in enumerate(splits):
+ load_dir = osp.join(root_path, 'waymo_format', split)
+ if split == 'validation':
+ save_dir = osp.join(out_dir, 'kitti_format', 'training')
+ else:
+ save_dir = osp.join(out_dir, 'kitti_format', split)
+ converter = waymo.Waymo2KITTI(
+ load_dir,
+ save_dir,
+ prefix=str(i),
+ workers=workers,
+            test_mode=(split == 'testing'))  # splits are named 'training'/'validation'/'testing'
+ converter.convert()
+ # Generate waymo infos
+ out_dir = osp.join(out_dir, 'kitti_format')
+ kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps)
+
+ create_groundtruth_database(
+ 'WaymoDataset',
+ out_dir,
+ info_prefix,
+ f'{out_dir}/{info_prefix}_infos_train.pkl',
+ relative_path=False,
+ with_mask=False)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+ '--root-path',
+ type=str,
+ default='./data/kitti',
+ help='specify the root path of dataset')
+parser.add_argument(
+ '--canbus',
+ type=str,
+ default='./data',
+ help='specify the root path of nuScenes canbus')
+parser.add_argument(
+ '--version',
+ type=str,
+ default='v1.0',
+ required=False,
+ help='specify the dataset version, no need for kitti')
+parser.add_argument(
+ '--max-sweeps',
+ type=int,
+ default=10,
+ required=False,
+ help='specify sweeps of lidar per example')
+parser.add_argument(
+ '--out-dir',
+ type=str,
+ default='./data/kitti',
+    required=False,
+ help='name of info pkl')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+ '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+ if args.dataset == 'kitti':
+ kitti_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=args.version,
+ out_dir=args.out_dir)
+ elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+ train_version = f'{args.version}-trainval'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ test_version = f'{args.version}-test'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=test_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+ train_version = f'{args.version}'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'lyft':
+ train_version = f'{args.version}-train'
+ lyft_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ max_sweeps=args.max_sweeps)
+ test_version = f'{args.version}-test'
+ lyft_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=test_version,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'waymo':
+ waymo_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ version=args.version,
+ out_dir=args.out_dir,
+ workers=args.workers,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'scannet':
+ scannet_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ out_dir=args.out_dir,
+ workers=args.workers)
+ elif args.dataset == 's3dis':
+ s3dis_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ out_dir=args.out_dir,
+ workers=args.workers)
+ elif args.dataset == 'sunrgbd':
+ sunrgbd_data_prep(
+ root_path=args.root_path,
+ info_prefix=args.extra_tag,
+ out_dir=args.out_dir,
+ workers=args.workers)
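The argparse block above drives the script from the shell, e.g. `python adzoo/vad/create_data.py nuscenes --version v1.0 --root-path ./data/nuscenes --canbus ./data --out-dir ./data/nuscenes --extra-tag nuscenes`. A minimal, illustrative check of the info files that `nuscenes_data_prep` is expected to produce (filenames follow the `{info_prefix}_infos_temporal_*.pkl` pattern used above; the paths here are assumptions):

```python
from os import path as osp

out_dir, info_prefix = './data/nuscenes', 'nuscenes'  # illustrative paths
expected = [
    f'{info_prefix}_infos_temporal_train.pkl',  # v1.0-trainval / v1.0-mini
    f'{info_prefix}_infos_temporal_val.pkl',    # v1.0-trainval / v1.0-mini
    f'{info_prefix}_infos_temporal_test.pkl',   # v1.0-test only
]
for name in expected:
    path = osp.join(out_dir, name)
    print(path, 'exists' if osp.exists(path) else 'missing')
```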
diff --git a/adzoo/vad/data_converter/__init__.py b/adzoo/vad/data_converter/__init__.py
new file mode 100644
index 0000000..ef101fe
--- /dev/null
+++ b/adzoo/vad/data_converter/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/adzoo/vad/data_converter/create_gt_database.py b/adzoo/vad/data_converter/create_gt_database.py
new file mode 100644
index 0000000..7317ced
--- /dev/null
+++ b/adzoo/vad/data_converter/create_gt_database.py
@@ -0,0 +1,338 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import pickle
+from mmcv import track_iter_progress
+from mmcv.ops import roi_align
+from os import path as osp
+from pycocotools import mask as maskUtils
+from pycocotools.coco import COCO
+
+from mmdet3d.core.bbox import box_np_ops as box_np_ops
+from mmdet3d.datasets import build_dataset
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+
+
+def _poly2mask(mask_ann, img_h, img_w):
+ if isinstance(mask_ann, list):
+ # polygon -- a single object might consist of multiple parts
+ # we merge all parts into one mask rle code
+ rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+ rle = maskUtils.merge(rles)
+ elif isinstance(mask_ann['counts'], list):
+ # uncompressed RLE
+ rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+ else:
+ # rle
+ rle = mask_ann
+ mask = maskUtils.decode(rle)
+ return mask
+
+
+def _parse_coco_ann_info(ann_info):
+ gt_bboxes = []
+ gt_labels = []
+ gt_bboxes_ignore = []
+ gt_masks_ann = []
+
+ for i, ann in enumerate(ann_info):
+ if ann.get('ignore', False):
+ continue
+ x1, y1, w, h = ann['bbox']
+ if ann['area'] <= 0:
+ continue
+ bbox = [x1, y1, x1 + w, y1 + h]
+ if ann.get('iscrowd', False):
+ gt_bboxes_ignore.append(bbox)
+ else:
+ gt_bboxes.append(bbox)
+ gt_masks_ann.append(ann['segmentation'])
+
+ if gt_bboxes:
+ gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+ gt_labels = np.array(gt_labels, dtype=np.int64)
+ else:
+ gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+ gt_labels = np.array([], dtype=np.int64)
+
+ if gt_bboxes_ignore:
+ gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+ else:
+ gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+ ann = dict(
+ bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann)
+
+ return ann
+
+
+def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks):
+ import torch
+ from torch.nn.modules.utils import _pair
+ device = pos_proposals.device
+ num_pos = pos_proposals.size(0)
+ fake_inds = (
+ torch.arange(num_pos,
+ device=device).to(dtype=pos_proposals.dtype)[:, None])
+ rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5
+ mask_size = _pair(28)
+ rois = rois.to(device=device)
+ gt_masks_th = (
+ torch.from_numpy(gt_masks).to(device).index_select(
+ 0, pos_assigned_gt_inds).to(dtype=rois.dtype))
+    # Using RoIAlign can apparently accelerate training (~0.1 s/iter)
+ targets = (
+ roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1))
+ return targets
+
+
+def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img):
+ num_pos = pos_proposals.shape[0]
+ masks = []
+ img_patches = []
+ for i in range(num_pos):
+ gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+ bbox = pos_proposals[i, :].astype(np.int32)
+ x1, y1, x2, y2 = bbox
+ w = np.maximum(x2 - x1 + 1, 1)
+ h = np.maximum(y2 - y1 + 1, 1)
+
+ mask_patch = gt_mask[y1:y1 + h, x1:x1 + w]
+ masked_img = gt_mask[..., None] * org_img
+ img_patch = masked_img[y1:y1 + h, x1:x1 + w]
+
+ img_patches.append(img_patch)
+ masks.append(mask_patch)
+ return img_patches, masks
+
+
+def create_groundtruth_database(dataset_class_name,
+ data_path,
+ info_prefix,
+ info_path=None,
+ mask_anno_path=None,
+ used_classes=None,
+ database_save_path=None,
+ db_info_save_path=None,
+ relative_path=True,
+ add_rgb=False,
+ lidar_only=False,
+ bev_only=False,
+ coors_range=None,
+ with_mask=False):
+ """Given the raw data, generate the ground truth database.
+
+ Args:
+ dataset_class_name (str): Name of the input dataset.
+ data_path (str): Path of the data.
+ info_prefix (str): Prefix of the info file.
+ info_path (str): Path of the info file.
+ Default: None.
+ mask_anno_path (str): Path of the mask_anno.
+ Default: None.
+        used_classes (list[str]): Classes that have been used.
+ Default: None.
+ database_save_path (str): Path to save database.
+ Default: None.
+ db_info_save_path (str): Path to save db_info.
+ Default: None.
+ relative_path (bool): Whether to use relative path.
+ Default: True.
+ with_mask (bool): Whether to use mask.
+ Default: False.
+ """
+ print(f'Create GT Database of {dataset_class_name}')
+ dataset_cfg = dict(
+ type=dataset_class_name, data_root=data_path, ann_file=info_path)
+ if dataset_class_name == 'KittiDataset':
+ file_client_args = dict(backend='disk')
+ dataset_cfg.update(
+ test_mode=False,
+ split='training',
+ modality=dict(
+ use_lidar=True,
+ use_depth=False,
+ use_lidar_intensity=True,
+ use_camera=with_mask,
+ ),
+ pipeline=[
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=4,
+ use_dim=4,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args)
+ ])
+
+ elif dataset_class_name == 'NuScenesDataset':
+ dataset_cfg.update(
+ use_valid_flag=True,
+ pipeline=[
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ use_dim=[0, 1, 2, 3, 4],
+ pad_empty_sweeps=True,
+ remove_close=True),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True)
+ ])
+
+ elif dataset_class_name == 'WaymoDataset':
+ file_client_args = dict(backend='disk')
+ dataset_cfg.update(
+ test_mode=False,
+ split='training',
+ modality=dict(
+ use_lidar=True,
+ use_depth=False,
+ use_lidar_intensity=True,
+ use_camera=False,
+ ),
+ pipeline=[
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=6,
+ use_dim=5,
+ file_client_args=file_client_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=True,
+ with_label_3d=True,
+ file_client_args=file_client_args)
+ ])
+
+ dataset = build_dataset(dataset_cfg)
+
+ if database_save_path is None:
+ database_save_path = osp.join(data_path, f'{info_prefix}_gt_database')
+ if db_info_save_path is None:
+ db_info_save_path = osp.join(data_path,
+ f'{info_prefix}_dbinfos_train.pkl')
+ mmcv.mkdir_or_exist(database_save_path)
+ all_db_infos = dict()
+ if with_mask:
+ coco = COCO(osp.join(data_path, mask_anno_path))
+ imgIds = coco.getImgIds()
+ file2id = dict()
+ for i in imgIds:
+ info = coco.loadImgs([i])[0]
+ file2id.update({info['file_name']: i})
+
+ group_counter = 0
+ for j in track_iter_progress(list(range(len(dataset)))):
+ input_dict = dataset.get_data_info(j)
+ dataset.pre_pipeline(input_dict)
+ example = dataset.pipeline(input_dict)
+ annos = example['ann_info']
+ image_idx = example['sample_idx']
+ points = example['points'].tensor.numpy()
+ gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy()
+ names = annos['gt_names']
+ group_dict = dict()
+ if 'group_ids' in annos:
+ group_ids = annos['group_ids']
+ else:
+ group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64)
+ difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32)
+ if 'difficulty' in annos:
+ difficulty = annos['difficulty']
+
+ num_obj = gt_boxes_3d.shape[0]
+ point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d)
+
+ if with_mask:
+ # prepare masks
+ gt_boxes = annos['gt_bboxes']
+ img_path = osp.split(example['img_info']['filename'])[-1]
+ if img_path not in file2id.keys():
+ print(f'skip image {img_path} for empty mask')
+ continue
+ img_id = file2id[img_path]
+ kins_annIds = coco.getAnnIds(imgIds=img_id)
+ kins_raw_info = coco.loadAnns(kins_annIds)
+ kins_ann_info = _parse_coco_ann_info(kins_raw_info)
+ h, w = annos['img_shape'][:2]
+ gt_masks = [
+ _poly2mask(mask, h, w) for mask in kins_ann_info['masks']
+ ]
+ # get mask inds based on iou mapping
+ bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes)
+ mask_inds = bbox_iou.argmax(axis=0)
+ valid_inds = (bbox_iou.max(axis=0) > 0.5)
+
+ # mask the image
+ # use more precise crop when it is ready
+ # object_img_patches = np.ascontiguousarray(
+ # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2))
+ # crop image patches using roi_align
+ # object_img_patches = crop_image_patch_v2(
+ # torch.Tensor(gt_boxes),
+ # torch.Tensor(mask_inds).long(), object_img_patches)
+ object_img_patches, object_masks = crop_image_patch(
+ gt_boxes, gt_masks, mask_inds, annos['img'])
+
+ for i in range(num_obj):
+ filename = f'{image_idx}_{names[i]}_{i}.bin'
+ abs_filepath = osp.join(database_save_path, filename)
+ rel_filepath = osp.join(f'{info_prefix}_gt_database', filename)
+
+ # save point clouds and image patches for each object
+ gt_points = points[point_indices[:, i]]
+ gt_points[:, :3] -= gt_boxes_3d[i, :3]
+
+ if with_mask:
+ if object_masks[i].sum() == 0 or not valid_inds[i]:
+ # Skip object for empty or invalid mask
+ continue
+ img_patch_path = abs_filepath + '.png'
+ mask_patch_path = abs_filepath + '.mask.png'
+ mmcv.imwrite(object_img_patches[i], img_patch_path)
+ mmcv.imwrite(object_masks[i], mask_patch_path)
+
+ with open(abs_filepath, 'w') as f:
+ gt_points.tofile(f)
+
+ if (used_classes is None) or names[i] in used_classes:
+ db_info = {
+ 'name': names[i],
+ 'path': rel_filepath,
+ 'image_idx': image_idx,
+ 'gt_idx': i,
+ 'box3d_lidar': gt_boxes_3d[i],
+ 'num_points_in_gt': gt_points.shape[0],
+ 'difficulty': difficulty[i],
+ }
+ local_group_id = group_ids[i]
+ # if local_group_id >= 0:
+ if local_group_id not in group_dict:
+ group_dict[local_group_id] = group_counter
+ group_counter += 1
+ db_info['group_id'] = group_dict[local_group_id]
+ if 'score' in annos:
+ db_info['score'] = annos['score'][i]
+ if with_mask:
+ db_info.update({'box2d_camera': gt_boxes[i]})
+ if names[i] in all_db_infos:
+ all_db_infos[names[i]].append(db_info)
+ else:
+ all_db_infos[names[i]] = [db_info]
+
+ for k, v in all_db_infos.items():
+ print(f'load {len(v)} {k} database infos')
+
+ with open(db_info_save_path, 'wb') as f:
+ pickle.dump(all_db_infos, f)
diff --git a/adzoo/vad/data_converter/vad_nuscenes_converter.py b/adzoo/vad/data_converter/vad_nuscenes_converter.py
new file mode 100644
index 0000000..338051c
--- /dev/null
+++ b/adzoo/vad/data_converter/vad_nuscenes_converter.py
@@ -0,0 +1,1005 @@
+import os
+import math
+import copy
+import argparse
+from os import path as osp
+from collections import OrderedDict
+from typing import List, Tuple, Union
+
+import mmcv
+import numpy as np
+from pyquaternion import Quaternion
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.data_classes import Box
+from shapely.geometry import MultiPoint, box
+from mmdet3d.datasets import NuScenesDataset
+from nuscenes.utils.geometry_utils import view_points
+from mmdet3d.core.bbox.box_np_ops import points_cam2img
+from nuscenes.utils.geometry_utils import transform_matrix
+
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+ 'barrier')
+
+nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
+ 'pedestrian.moving', 'pedestrian.standing',
+ 'pedestrian.sitting_lying_down', 'vehicle.moving',
+ 'vehicle.parked', 'vehicle.stopped', 'None')
+
+ego_width, ego_length = 1.85, 4.084
+
+def quart_to_rpy(qua):
+ x, y, z, w = qua
+ roll = math.atan2(2 * (w * x + y * z), 1 - 2 * (x * x + y * y))
+ pitch = math.asin(2 * (w * y - x * z))
+ yaw = math.atan2(2 * (w * z + x * y), 1 - 2 * (z * z + y * y))
+ return roll, pitch, yaw
+
+def locate_message(utimes, utime):
+ i = np.searchsorted(utimes, utime)
+ if i == len(utimes) or (i > 0 and utime - utimes[i-1] < utimes[i] - utime):
+ i -= 1
+ return i
+
+
+def create_nuscenes_infos(root_path,
+ out_path,
+ can_bus_root_path,
+ info_prefix,
+ version='v1.0-trainval',
+ max_sweeps=10):
+ """Create info file of nuscene dataset.
+
+ Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        root_path (str): Path of the data root.
+        out_path (str): Output directory of the generated info files.
+        can_bus_root_path (str): Root path of the nuScenes CAN bus data.
+        info_prefix (str): Prefix of the info file to be generated.
+        version (str): Version of the data.
+            Default: 'v1.0-trainval'
+        max_sweeps (int): Max number of sweeps.
+            Default: 10
+ """
+ from nuscenes.nuscenes import NuScenes
+ from nuscenes.can_bus.can_bus_api import NuScenesCanBus
+ print(version, root_path)
+ nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+ nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path)
+ from nuscenes.utils import splits
+ available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini']
+ assert version in available_vers
+ if version == 'v1.0-trainval':
+ train_scenes = splits.train
+ val_scenes = splits.val
+ elif version == 'v1.0-test':
+ train_scenes = splits.test
+ val_scenes = []
+ elif version == 'v1.0-mini':
+ train_scenes = splits.mini_train
+ val_scenes = splits.mini_val
+ else:
+ raise ValueError('unknown')
+
+ # filter existing scenes.
+ available_scenes = get_available_scenes(nusc)
+ available_scene_names = [s['name'] for s in available_scenes]
+ train_scenes = list(
+ filter(lambda x: x in available_scene_names, train_scenes))
+ val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+ train_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in train_scenes
+ ])
+ val_scenes = set([
+ available_scenes[available_scene_names.index(s)]['token']
+ for s in val_scenes
+ ])
+
+ test = 'test' in version
+ if test:
+ print('test scene: {}'.format(len(train_scenes)))
+ else:
+ print('train scene: {}, val scene: {}'.format(
+ len(train_scenes), len(val_scenes)))
+
+ train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
+ nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+ metadata = dict(version=version)
+ if test:
+ print('test sample: {}'.format(len(train_nusc_infos)))
+ data = dict(infos=train_nusc_infos, metadata=metadata)
+ info_path = osp.join(out_path,
+ '{}_infos_temporal_test.pkl'.format(info_prefix))
+ mmcv.dump(data, info_path)
+ else:
+ print('train sample: {}, val sample: {}'.format(
+ len(train_nusc_infos), len(val_nusc_infos)))
+ data = dict(infos=train_nusc_infos, metadata=metadata)
+ info_path = osp.join(out_path,
+ '{}_infos_temporal_train.pkl'.format(info_prefix))
+ mmcv.dump(data, info_path)
+ data['infos'] = val_nusc_infos
+ info_val_path = osp.join(out_path,
+ '{}_infos_temporal_val.pkl'.format(info_prefix))
+ mmcv.dump(data, info_val_path)
+
+
+def get_available_scenes(nusc):
+ """Get available scenes from the input nuscenes class.
+
+ Given the raw data, get the information of available scenes for
+ further info generation.
+
+ Args:
+ nusc (class): Dataset class in the nuScenes dataset.
+
+ Returns:
+ available_scenes (list[dict]): List of basic information for the
+ available scenes.
+ """
+ available_scenes = []
+ print('total scene num: {}'.format(len(nusc.scene)))
+ for scene in nusc.scene:
+ scene_token = scene['token']
+ scene_rec = nusc.get('scene', scene_token)
+ sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
+ sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+ has_more_frames = True
+ scene_not_exist = False
+ while has_more_frames:
+ lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token'])
+ lidar_path = str(lidar_path)
+ if os.getcwd() in lidar_path:
+ # path from lyftdataset is absolute path
+ lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1]
+ # relative path
+ if not mmcv.is_filepath(lidar_path):
+ scene_not_exist = True
+ break
+ else:
+ break
+ if scene_not_exist:
+ continue
+ available_scenes.append(scene)
+ print('exist scene num: {}'.format(len(available_scenes)))
+ return available_scenes
+
+
+def _get_can_bus_info(nusc, nusc_can_bus, sample):
+ scene_name = nusc.get('scene', sample['scene_token'])['name']
+ sample_timestamp = sample['timestamp']
+ try:
+ pose_list = nusc_can_bus.get_messages(scene_name, 'pose')
+ except:
+ return np.zeros(18) # server scenes do not have can bus information.
+ can_bus = []
+    # during each scene, the first timestamp of can_bus may be larger than the first sample's timestamp
+ last_pose = pose_list[0]
+ for i, pose in enumerate(pose_list):
+ if pose['utime'] > sample_timestamp:
+ break
+ last_pose = pose
+ _ = last_pose.pop('utime') # useless
+ pos = last_pose.pop('pos')
+ rotation = last_pose.pop('orientation')
+ can_bus.extend(pos)
+ can_bus.extend(rotation)
+ for key in last_pose.keys():
+ can_bus.extend(pose[key]) # 16 elements
+ can_bus.extend([0., 0.])
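+    # Resulting 18-dim layout (based on the standard nuScenes CAN bus 'pose'
+    # message, whose remaining keys after the pops above are accel,
+    # rotation_rate and vel):
+    #   [0:3] pos, [3:7] orientation quaternion, [7:10] accel,
+    #   [10:13] rotation_rate, [13:16] vel, [16:18] placeholder zeros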
+ return np.array(can_bus)
+
+
+def _fill_trainval_infos(nusc,
+ nusc_can_bus,
+ train_scenes,
+ val_scenes,
+ test=False,
+ max_sweeps=10,
+ fut_ts=6,
+ his_ts=2):
+ """Generate the train/val infos from the raw data.
+
+ Args:
+ nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
+ train_scenes (list[str]): Basic information of training scenes.
+ val_scenes (list[str]): Basic information of validation scenes.
+        test (bool): Whether to use test mode. In test mode, no
+ annotations can be accessed. Default: False.
+ max_sweeps (int): Max number of sweeps. Default: 10.
+
+ Returns:
+ tuple[list[dict]]: Information of training set and validation set
+ that will be saved to the info file.
+ """
+ train_nusc_infos = []
+ val_nusc_infos = []
+ frame_idx = 0
+ cat2idx = {}
+ for idx, dic in enumerate(nusc.category):
+ cat2idx[dic['name']] = idx
+
+ for sample in mmcv.track_iter_progress(nusc.sample):
+ map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location']
+ lidar_token = sample['data']['LIDAR_TOP']
+ sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ if sample['prev'] != '':
+ sample_prev = nusc.get('sample', sample['prev'])
+ sd_rec_prev = nusc.get('sample_data', sample_prev['data']['LIDAR_TOP'])
+ pose_record_prev = nusc.get('ego_pose', sd_rec_prev['ego_pose_token'])
+ else:
+ pose_record_prev = None
+ if sample['next'] != '':
+ sample_next = nusc.get('sample', sample['next'])
+ sd_rec_next = nusc.get('sample_data', sample_next['data']['LIDAR_TOP'])
+ pose_record_next = nusc.get('ego_pose', sd_rec_next['ego_pose_token'])
+ else:
+ pose_record_next = None
+
+ lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
+
+ mmcv.check_file_exist(lidar_path)
+ can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample)
+ fut_valid_flag = True
+ test_sample = copy.deepcopy(sample)
+ for i in range(fut_ts):
+ if test_sample['next'] != '':
+ test_sample = nusc.get('sample', test_sample['next'])
+ else:
+ fut_valid_flag = False
+ ##
+ info = {
+ 'lidar_path': lidar_path,
+ 'token': sample['token'],
+ 'prev': sample['prev'],
+ 'next': sample['next'],
+ 'can_bus': can_bus,
+ 'frame_idx': frame_idx, # temporal related info
+ 'sweeps': [],
+ 'cams': dict(),
+ 'scene_token': sample['scene_token'], # temporal related info
+ 'lidar2ego_translation': cs_record['translation'],
+ 'lidar2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sample['timestamp'],
+ 'fut_valid_flag': fut_valid_flag,
+ 'map_location': map_location
+ }
+
+ if sample['next'] == '':
+ frame_idx = 0
+ else:
+ frame_idx += 1
+
+ l2e_r = info['lidar2ego_rotation']
+ l2e_t = info['lidar2ego_translation']
+ e2g_r = info['ego2global_rotation']
+ e2g_t = info['ego2global_translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ # obtain 6 image's information per frame
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ for cam in camera_types:
+ cam_token = sample['data'][cam]
+ cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
+ cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
+ e2g_t, e2g_r_mat, cam)
+ cam_info.update(cam_intrinsic=cam_intrinsic)
+ info['cams'].update({cam: cam_info})
+
+ # obtain sweeps for a single key-frame
+ sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ sweeps = []
+ while len(sweeps) < max_sweeps:
+ if not sd_rec['prev'] == '':
+ sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t,
+ l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+ sweeps.append(sweep)
+ sd_rec = nusc.get('sample_data', sd_rec['prev'])
+ else:
+ break
+ info['sweeps'] = sweeps
+ # obtain annotation
+ if not test:
+ annotations = [
+ nusc.get('sample_annotation', token)
+ for token in sample['anns']
+ ]
+ locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+ dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+ rots = np.array([b.orientation.yaw_pitch_roll[0]
+ for b in boxes]).reshape(-1, 1)
+ velocity = np.array(
+ [nusc.box_velocity(token)[:2] for token in sample['anns']])
+ valid_flag = np.array(
+ [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0
+ for anno in annotations],
+ dtype=bool).reshape(-1)
+ # convert velo from global to lidar
+ for i in range(len(boxes)):
+ velo = np.array([*velocity[i], 0.0])
+ velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(
+ l2e_r_mat).T
+ velocity[i] = velo[:2]
+
+ names = [b.name for b in boxes]
+ for i in range(len(names)):
+ if names[i] in NuScenesDataset.NameMapping:
+ names[i] = NuScenesDataset.NameMapping[names[i]]
+ names = np.array(names)
+ # we need to convert rot to SECOND format.
+ gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
+ assert len(gt_boxes) == len(
+ annotations), f'{len(gt_boxes)}, {len(annotations)}'
+
+ # get future coords for each box
+ # [num_box, fut_ts*2]
+ num_box = len(boxes)
+ gt_fut_trajs = np.zeros((num_box, fut_ts, 2))
+ gt_fut_yaw = np.zeros((num_box, fut_ts))
+ gt_fut_masks = np.zeros((num_box, fut_ts))
+ gt_boxes_yaw = -(gt_boxes[:,6] + np.pi / 2)
+ # agent lcf feat (x, y, yaw, vx, vy, width, length, height, type)
+ agent_lcf_feat = np.zeros((num_box, 9))
+ gt_fut_goal = np.zeros((num_box))
+ for i, anno in enumerate(annotations):
+ cur_box = boxes[i]
+ cur_anno = anno
+ agent_lcf_feat[i, 0:2] = cur_box.center[:2]
+ agent_lcf_feat[i, 2] = gt_boxes_yaw[i]
+ agent_lcf_feat[i, 3:5] = velocity[i]
+ agent_lcf_feat[i, 5:8] = anno['size'] # width,length,height
+ agent_lcf_feat[i, 8] = cat2idx[anno['category_name']] if anno['category_name'] in cat2idx.keys() else -1
+ for j in range(fut_ts):
+ if cur_anno['next'] != '':
+ anno_next = nusc.get('sample_annotation', cur_anno['next'])
+ box_next = Box(
+ anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+ )
+ # Move box to ego vehicle coord system.
+ box_next.translate(-np.array(pose_record['translation']))
+ box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+ # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation']))
+ box_next.rotate(Quaternion(cs_record['rotation']).inverse)
+ gt_fut_trajs[i, j] = box_next.center[:2] - cur_box.center[:2]
+ gt_fut_masks[i, j] = 1
+ # add yaw diff
+ _, _, box_yaw = quart_to_rpy([cur_box.orientation.x, cur_box.orientation.y,
+ cur_box.orientation.z, cur_box.orientation.w])
+ _, _, box_yaw_next = quart_to_rpy([box_next.orientation.x, box_next.orientation.y,
+ box_next.orientation.z, box_next.orientation.w])
+ gt_fut_yaw[i, j] = box_yaw_next - box_yaw
+ cur_anno = anno_next
+ cur_box = box_next
+ else:
+ gt_fut_trajs[i, j:] = 0
+ break
+ # get agent goal
+ gt_fut_coords = np.cumsum(gt_fut_trajs[i], axis=-2)
+ coord_diff = gt_fut_coords[-1] - gt_fut_coords[0]
+ if coord_diff.max() < 1.0: # static
+ gt_fut_goal[i] = 9
+ else:
+ box_mot_yaw = np.arctan2(coord_diff[1], coord_diff[0]) + np.pi
+ gt_fut_goal[i] = box_mot_yaw // (np.pi / 4) # 0-8: goal direction class
+
+ # get ego history traj (offset format)
+ ego_his_trajs = np.zeros((his_ts+1, 3))
+ ego_his_trajs_diff = np.zeros((his_ts+1, 3))
+ sample_cur = sample
+ for i in range(his_ts, -1, -1):
+ if sample_cur is not None:
+ pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False)
+ ego_his_trajs[i] = pose_mat[:3, 3]
+ has_prev = sample_cur['prev'] != ''
+ has_next = sample_cur['next'] != ''
+ if has_next:
+ sample_next = nusc.get('sample', sample_cur['next'])
+ pose_mat_next = get_global_sensor_pose(sample_next, nusc, inverse=False)
+ ego_his_trajs_diff[i] = pose_mat_next[:3, 3] - ego_his_trajs[i]
+ sample_cur = nusc.get('sample', sample_cur['prev']) if has_prev else None
+ else:
+ ego_his_trajs[i] = ego_his_trajs[i+1] - ego_his_trajs_diff[i+1]
+ ego_his_trajs_diff[i] = ego_his_trajs_diff[i+1]
+
+ # global to ego at lcf
+ ego_his_trajs = ego_his_trajs - np.array(pose_record['translation'])
+ rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix
+ ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T
+ # ego to lidar at lcf
+ ego_his_trajs = ego_his_trajs - np.array(cs_record['translation'])
+ rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix
+ ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T
+ ego_his_trajs = ego_his_trajs[1:] - ego_his_trajs[:-1]
+
+            # get ego future traj (offset format)
+ ego_fut_trajs = np.zeros((fut_ts+1, 3))
+ ego_fut_masks = np.zeros((fut_ts+1))
+ sample_cur = sample
+ for i in range(fut_ts+1):
+ pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False)
+ ego_fut_trajs[i] = pose_mat[:3, 3]
+ ego_fut_masks[i] = 1
+ if sample_cur['next'] == '':
+ ego_fut_trajs[i+1:] = ego_fut_trajs[i]
+ break
+ else:
+ sample_cur = nusc.get('sample', sample_cur['next'])
+ # global to ego at lcf
+ ego_fut_trajs = ego_fut_trajs - np.array(pose_record['translation'])
+ rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix
+ ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T
+ # ego to lidar at lcf
+ ego_fut_trajs = ego_fut_trajs - np.array(cs_record['translation'])
+ rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix
+ ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T
+ # drive command according to final fut step offset from lcf
+ if ego_fut_trajs[-1][0] >= 2:
+ command = np.array([1, 0, 0]) # Turn Right
+ elif ego_fut_trajs[-1][0] <= -2:
+ command = np.array([0, 1, 0]) # Turn Left
+ else:
+ command = np.array([0, 0, 1]) # Go Straight
+ # offset from lcf -> per-step offset
+ ego_fut_trajs = ego_fut_trajs[1:] - ego_fut_trajs[:-1]
+
+            ### ego lcf feat (vx, vy, ax, ay, w, length, width, vel, steer), w: yaw rate
+ ego_lcf_feat = np.zeros(9)
+            # estimate the ego velocity and acceleration from odometry
+ _, _, ego_yaw = quart_to_rpy(pose_record['rotation'])
+ ego_pos = np.array(pose_record['translation'])
+ if pose_record_prev is not None:
+ _, _, ego_yaw_prev = quart_to_rpy(pose_record_prev['rotation'])
+ ego_pos_prev = np.array(pose_record_prev['translation'])
+ if pose_record_next is not None:
+ _, _, ego_yaw_next = quart_to_rpy(pose_record_next['rotation'])
+ ego_pos_next = np.array(pose_record_next['translation'])
+        assert (pose_record_prev is not None) or (pose_record_next is not None), 'both prev and next pose records are empty'
+ if pose_record_prev is not None:
+ ego_w = (ego_yaw - ego_yaw_prev) / 0.5
+ ego_v = np.linalg.norm(ego_pos[:2] - ego_pos_prev[:2]) / 0.5
+ ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2)
+ else:
+ ego_w = (ego_yaw_next - ego_yaw) / 0.5
+ ego_v = np.linalg.norm(ego_pos_next[:2] - ego_pos[:2]) / 0.5
+ ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2)
+
+ ref_scene = nusc.get("scene", sample['scene_token'])
+ try:
+ pose_msgs = nusc_can_bus.get_messages(ref_scene['name'],'pose')
+ steer_msgs = nusc_can_bus.get_messages(ref_scene['name'], 'steeranglefeedback')
+ pose_uts = [msg['utime'] for msg in pose_msgs]
+ steer_uts = [msg['utime'] for msg in steer_msgs]
+ ref_utime = sample['timestamp']
+ pose_index = locate_message(pose_uts, ref_utime)
+ pose_data = pose_msgs[pose_index]
+ steer_index = locate_message(steer_uts, ref_utime)
+ steer_data = steer_msgs[steer_index]
+ # initial speed
+ v0 = pose_data["vel"][0] # [0] means longitudinal velocity m/s
+ # curvature (positive: turn left)
+ steering = steer_data["value"]
+ # flip x axis if in left-hand traffic (singapore)
+ flip_flag = True if map_location.startswith('singapore') else False
+ if flip_flag:
+ steering *= -1
+ Kappa = 2 * steering / 2.588
+ except:
+ delta_x = ego_his_trajs[-1, 0] + ego_fut_trajs[0, 0]
+ delta_y = ego_his_trajs[-1, 1] + ego_fut_trajs[0, 1]
+ v0 = np.sqrt(delta_x**2 + delta_y**2)
+ Kappa = 0
+
+ ego_lcf_feat[:2] = np.array([ego_vx, ego_vy]) #can_bus[13:15]
+ ego_lcf_feat[2:4] = can_bus[7:9]
+ ego_lcf_feat[4] = ego_w #can_bus[12]
+ ego_lcf_feat[5:7] = np.array([ego_length, ego_width])
+ ego_lcf_feat[7] = v0
+ ego_lcf_feat[8] = Kappa
+
+ info['gt_boxes'] = gt_boxes
+ info['gt_names'] = names
+ info['gt_velocity'] = velocity.reshape(-1, 2)
+ info['num_lidar_pts'] = np.array(
+ [a['num_lidar_pts'] for a in annotations])
+ info['num_radar_pts'] = np.array(
+ [a['num_radar_pts'] for a in annotations])
+ info['valid_flag'] = valid_flag
+ info['gt_agent_fut_trajs'] = gt_fut_trajs.reshape(-1, fut_ts*2).astype(np.float32)
+ info['gt_agent_fut_masks'] = gt_fut_masks.reshape(-1, fut_ts).astype(np.float32)
+ info['gt_agent_lcf_feat'] = agent_lcf_feat.astype(np.float32)
+ info['gt_agent_fut_yaw'] = gt_fut_yaw.astype(np.float32)
+ info['gt_agent_fut_goal'] = gt_fut_goal.astype(np.float32)
+ info['gt_ego_his_trajs'] = ego_his_trajs[:, :2].astype(np.float32)
+ info['gt_ego_fut_trajs'] = ego_fut_trajs[:, :2].astype(np.float32)
+ info['gt_ego_fut_masks'] = ego_fut_masks[1:].astype(np.float32)
+ info['gt_ego_fut_cmd'] = command.astype(np.float32)
+ info['gt_ego_lcf_feat'] = ego_lcf_feat.astype(np.float32)
+
+ if sample['scene_token'] in train_scenes:
+ train_nusc_infos.append(info)
+ else:
+ val_nusc_infos.append(info)
+
+ return train_nusc_infos, val_nusc_infos
+
+def get_global_sensor_pose(rec, nusc, inverse=False):
+ lidar_sample_data = nusc.get('sample_data', rec['data']['LIDAR_TOP'])
+
+ sd_ep = nusc.get("ego_pose", lidar_sample_data["ego_pose_token"])
+ sd_cs = nusc.get("calibrated_sensor", lidar_sample_data["calibrated_sensor_token"])
+ if inverse is False:
+ global_from_ego = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=False)
+ ego_from_sensor = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=False)
+ pose = global_from_ego.dot(ego_from_sensor)
+        # equivalent way to compute the translation (kept for reference):
+ # pose_translation = np.array(sd_cs["translation"])
+ # rot_mat = Quaternion(sd_ep['rotation']).rotation_matrix
+ # pose_translation = np.dot(rot_mat, pose_translation)
+ # # pose_translation = pose[:3, 3]
+ # pose_translation = pose_translation + np.array(sd_ep["translation"])
+ else:
+ sensor_from_ego = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=True)
+ ego_from_global = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=True)
+ pose = sensor_from_ego.dot(ego_from_global)
+ return pose
+
+def obtain_sensor2top(nusc,
+ sensor_token,
+ l2e_t,
+ l2e_r_mat,
+ e2g_t,
+ e2g_r_mat,
+ sensor_type='lidar'):
+ """Obtain the info with RT matric from general sensor to Top LiDAR.
+
+ Args:
+ nusc (class): Dataset class in the nuScenes dataset.
+ sensor_token (str): Sample data token corresponding to the
+ specific sensor type.
+ l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+ l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+ in shape (3, 3).
+ e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+ e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+ in shape (3, 3).
+ sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+ Returns:
+ sweep (dict): Sweep information after transformation.
+ """
+ sd_rec = nusc.get('sample_data', sensor_token)
+ cs_record = nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ data_path = str(nusc.get_sample_data_path(sd_rec['token']))
+ if os.getcwd() in data_path: # path from lyftdataset is absolute path
+ data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path
+ sweep = {
+ 'data_path': data_path,
+ 'type': sensor_type,
+ 'sample_data_token': sd_rec['token'],
+ 'sensor2ego_translation': cs_record['translation'],
+ 'sensor2ego_rotation': cs_record['rotation'],
+ 'ego2global_translation': pose_record['translation'],
+ 'ego2global_rotation': pose_record['rotation'],
+ 'timestamp': sd_rec['timestamp']
+ }
+
+ l2e_r_s = sweep['sensor2ego_rotation']
+ l2e_t_s = sweep['sensor2ego_translation']
+ e2g_r_s = sweep['ego2global_rotation']
+ e2g_t_s = sweep['ego2global_translation']
+
+ # obtain the RT from sensor to Top LiDAR
+ # sweep->ego->global->ego'->lidar
+ l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
+ e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
+ R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
+ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
+ ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
+ sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T
+ sweep['sensor2lidar_translation'] = T
+ return sweep
+
+
+def export_2d_annotation(root_path, info_path, version, mono3d=False):
+ """Export 2d annotation from the info file and raw data.
+
+ Args:
+ root_path (str): Root path of the raw data.
+ info_path (str): Path of the info file.
+ version (str): Dataset version.
+ mono3d (bool): Whether to export mono3d annotation. Default: False.
+ """
+ # get bbox annotations for camera
+ camera_types = [
+ 'CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_FRONT_LEFT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_BACK_RIGHT',
+ ]
+ nusc_infos = mmcv.load(info_path)['infos']
+ nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+ # info_2d_list = []
+ cat2Ids = [
+ dict(id=nus_categories.index(cat_name), name=cat_name)
+ for cat_name in nus_categories
+ ]
+ coco_ann_id = 0
+ coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+ for info in mmcv.track_iter_progress(nusc_infos):
+ for cam in camera_types:
+ cam_info = info['cams'][cam]
+ coco_infos = get_2d_boxes(
+ nusc,
+ cam_info['sample_data_token'],
+ visibilities=['', '1', '2', '3', '4'],
+ mono3d=mono3d)
+ (height, width, _) = mmcv.imread(cam_info['data_path']).shape
+ coco_2d_dict['images'].append(
+ dict(
+ file_name=cam_info['data_path'].split('data/nuscenes/')
+ [-1],
+ id=cam_info['sample_data_token'],
+ token=info['token'],
+ cam2ego_rotation=cam_info['sensor2ego_rotation'],
+ cam2ego_translation=cam_info['sensor2ego_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ ego2global_translation=info['ego2global_translation'],
+ cam_intrinsic=cam_info['cam_intrinsic'],
+ width=width,
+ height=height))
+ for coco_info in coco_infos:
+ if coco_info is None:
+ continue
+ # add an empty key for coco format
+ coco_info['segmentation'] = []
+ coco_info['id'] = coco_ann_id
+ coco_2d_dict['annotations'].append(coco_info)
+ coco_ann_id += 1
+ if mono3d:
+ json_prefix = f'{info_path[:-4]}_mono3d'
+ else:
+ json_prefix = f'{info_path[:-4]}'
+ mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(nusc,
+ sample_data_token: str,
+ visibilities: List[str],
+ mono3d=True):
+ """Get the 2D annotation records for a given `sample_data_token`.
+
+ Args:
+ sample_data_token (str): Sample data token belonging to a camera \
+ keyframe.
+ visibilities (list[str]): Visibility filter.
+ mono3d (bool): Whether to get boxes with mono3d annotation.
+
+ Return:
+ list[dict]: List of 2D annotation record that belongs to the input
+ `sample_data_token`.
+ """
+
+ # Get the sample data and the sample corresponding to that sample data.
+ sd_rec = nusc.get('sample_data', sample_data_token)
+
+ assert sd_rec[
+ 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
+ ' for camera sample_data!'
+ if not sd_rec['is_key_frame']:
+ raise ValueError(
+ 'The 2D re-projections are available only for keyframes.')
+
+ s_rec = nusc.get('sample', sd_rec['sample_token'])
+
+ # Get the calibrated sensor and ego pose
+ # record to get the transformation matrices.
+ cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
+ pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+ camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
+
+    # Get all the annotations with the specified visibilities.
+ ann_recs = [
+ nusc.get('sample_annotation', token) for token in s_rec['anns']
+ ]
+ ann_recs = [
+ ann_rec for ann_rec in ann_recs
+ if (ann_rec['visibility_token'] in visibilities)
+ ]
+
+ repro_recs = []
+
+ for ann_rec in ann_recs:
+ # Augment sample_annotation with token information.
+ ann_rec['sample_annotation_token'] = ann_rec['token']
+ ann_rec['sample_data_token'] = sample_data_token
+
+ # Get the box in global coordinates.
+ box = nusc.get_box(ann_rec['token'])
+
+ # Move them to the ego-pose frame.
+ box.translate(-np.array(pose_rec['translation']))
+ box.rotate(Quaternion(pose_rec['rotation']).inverse)
+
+ # Move them to the calibrated sensor frame.
+ box.translate(-np.array(cs_rec['translation']))
+ box.rotate(Quaternion(cs_rec['rotation']).inverse)
+
+ # Filter out the corners that are not in front of the calibrated
+ # sensor.
+ corners_3d = box.corners()
+ in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+ corners_3d = corners_3d[:, in_front]
+
+ # Project 3d box to 2d.
+ corner_coords = view_points(corners_3d, camera_intrinsic,
+ True).T[:, :2].tolist()
+
+ # Keep only corners that fall within the image.
+ final_coords = post_process_coords(corner_coords)
+
+ # Skip if the convex hull of the re-projected corners
+ # does not intersect the image canvas.
+ if final_coords is None:
+ continue
+ else:
+ min_x, min_y, max_x, max_y = final_coords
+
+ # Generate dictionary record to be included in the .json file.
+ repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+ sample_data_token, sd_rec['filename'])
+
+ # If mono3d=True, add 3D annotations in camera coordinates
+ if mono3d and (repro_rec is not None):
+ loc = box.center.tolist()
+
+ dim = box.wlh
+ dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
+ dim = dim.tolist()
+
+ rot = box.orientation.yaw_pitch_roll[0]
+ rot = [-rot] # convert the rot to our cam coordinate
+
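+            # Transform the box velocity from the global frame into the camera
+            # frame by applying the inverse ego->global and sensor->ego rotations.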
+ global_velo2d = nusc.box_velocity(box.token)[:2]
+ global_velo3d = np.array([*global_velo2d, 0.0])
+ e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
+ c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
+ cam_velo3d = global_velo3d @ np.linalg.inv(
+ e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
+ velo = cam_velo3d[0::2].tolist()
+
+ repro_rec['bbox_cam3d'] = loc + dim + rot
+ repro_rec['velo_cam3d'] = velo
+
+ center3d = np.array(loc).reshape([1, 3])
+ center2d = points_cam2img(
+ center3d, camera_intrinsic, with_depth=True)
+ repro_rec['center2d'] = center2d.squeeze().tolist()
+            # center2d = projected 2D box center (in pixels) plus depth;
+            # samples with non-positive depth are skipped
+ if repro_rec['center2d'][2] <= 0:
+ continue
+
+ ann_token = nusc.get('sample_annotation',
+ box.token)['attribute_tokens']
+ if len(ann_token) == 0:
+ attr_name = 'None'
+ else:
+ attr_name = nusc.get('attribute', ann_token[0])['name']
+ attr_id = nus_attributes.index(attr_name)
+ repro_rec['attribute_name'] = attr_name
+ repro_rec['attribute_id'] = attr_id
+
+ repro_recs.append(repro_rec)
+
+ return repro_recs
+
+
+def post_process_coords(
+ corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
+) -> Union[Tuple[float, float, float, float], None]:
+ """Get the intersection of the convex hull of the reprojected bbox corners
+ and the image canvas, return None if no intersection.
+
+ Args:
+ corner_coords (list[int]): Corner coordinates of reprojected
+ bounding box.
+ imsize (tuple[int]): Size of the image canvas.
+
+    Returns:
+        tuple[float]: Intersection of the convex hull of the 2D box
+            corners and the image canvas.
+ """
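+    # Build the convex hull of the projected corners with shapely, clip it
+    # against the image rectangle, and take the axis-aligned bounds of the
+    # intersection as the final 2D box.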
+ polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
+ img_canvas = box(0, 0, imsize[0], imsize[1])
+
+ if polygon_from_2d_box.intersects(img_canvas):
+ img_intersection = polygon_from_2d_box.intersection(img_canvas)
+ intersection_coords = np.array(
+ [coord for coord in img_intersection.exterior.coords])
+
+ min_x = min(intersection_coords[:, 0])
+ min_y = min(intersection_coords[:, 1])
+ max_x = max(intersection_coords[:, 0])
+ max_y = max(intersection_coords[:, 1])
+
+ return min_x, min_y, max_x, max_y
+ else:
+ return None
+
+
+def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
+ sample_data_token: str, filename: str) -> OrderedDict:
+    """Generate one 2D annotation record given various information on top of
+ the 2D bounding box coordinates.
+
+ Args:
+ ann_rec (dict): Original 3d annotation record.
+ x1 (float): Minimum value of the x coordinate.
+ y1 (float): Minimum value of the y coordinate.
+ x2 (float): Maximum value of the x coordinate.
+ y2 (float): Maximum value of the y coordinate.
+ sample_data_token (str): Sample data token.
+        filename (str): The corresponding image file where the annotation
+            is present.
+
+ Returns:
+ dict: A sample 2D annotation record.
+            - file_name (str): file name
+ - image_id (str): sample data token
+ - area (float): 2d box area
+ - category_name (str): category name
+ - category_id (int): category id
+ - bbox (list[float]): left x, top y, dx, dy of 2d box
+ - iscrowd (int): whether the area is crowd
+ """
+ repro_rec = OrderedDict()
+ repro_rec['sample_data_token'] = sample_data_token
+ coco_rec = dict()
+
+ relevant_keys = [
+ 'attribute_tokens',
+ 'category_name',
+ 'instance_token',
+ 'next',
+ 'num_lidar_pts',
+ 'num_radar_pts',
+ 'prev',
+ 'sample_annotation_token',
+ 'sample_data_token',
+ 'visibility_token',
+ ]
+
+ for key, value in ann_rec.items():
+ if key in relevant_keys:
+ repro_rec[key] = value
+
+ repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+ repro_rec['filename'] = filename
+
+ coco_rec['file_name'] = filename
+ coco_rec['image_id'] = sample_data_token
+ coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+ if repro_rec['category_name'] not in NuScenesDataset.NameMapping:
+ return None
+ cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']]
+ coco_rec['category_name'] = cat_name
+ coco_rec['category_id'] = nus_categories.index(cat_name)
+ coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+ coco_rec['iscrowd'] = 0
+
+ return coco_rec
+
+
+def nuscenes_data_prep(root_path,
+ can_bus_root_path,
+ info_prefix,
+ version,
+ dataset_name,
+ out_dir,
+ max_sweeps=10):
+ """Prepare data related to nuScenes dataset.
+
+ Related data consists of '.pkl' files recording basic infos,
+ 2D annotations and groundtruth database.
+
+ Args:
+ root_path (str): Path of dataset root.
+ info_prefix (str): The prefix of info filenames.
+ version (str): Dataset version.
+ dataset_name (str): The dataset class name.
+ out_dir (str): Output directory of the groundtruth database info.
+ max_sweeps (int): Number of input consecutive frames. Default: 10
+ """
+ create_nuscenes_infos(
+ root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+ '--root-path',
+ type=str,
+ default='./data/kitti',
+ help='specify the root path of dataset')
+parser.add_argument(
+ '--canbus',
+ type=str,
+ default='./data',
+ help='specify the root path of nuScenes canbus')
+parser.add_argument(
+ '--version',
+ type=str,
+ default='v1.0',
+ required=False,
+ help='specify the dataset version, no need for kitti')
+parser.add_argument(
+ '--max-sweeps',
+ type=int,
+ default=10,
+ required=False,
+ help='specify sweeps of lidar per example')
+parser.add_argument(
+ '--out-dir',
+ type=str,
+ default='./data/kitti',
+    required=False,
+ help='name of info pkl')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+ '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+ if args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+ train_version = f'{args.version}-trainval'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ test_version = f'{args.version}-test'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=test_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
+ elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+ train_version = f'{args.version}'
+ nuscenes_data_prep(
+ root_path=args.root_path,
+ can_bus_root_path=args.canbus,
+ info_prefix=args.extra_tag,
+ version=train_version,
+ dataset_name='NuScenesDataset',
+ out_dir=args.out_dir,
+ max_sweeps=args.max_sweeps)
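+
+# Example invocation (a sketch; the script filename and data layout below are
+# assumptions, not documented commands of this repo):
+#   python <this_converter_script> nuscenes --root-path ./data/nuscenes \
+#       --canbus ./data --version v1.0 --out-dir ./data/nuscenes --extra-tag nuscenes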
diff --git a/adzoo/vad/dist_test.sh b/adzoo/vad/dist_test.sh
new file mode 100755
index 0000000..3e2ec30
--- /dev/null
+++ b/adzoo/vad/dist_test.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
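+# Usage (a sketch): bash adzoo/vad/dist_test.sh <CONFIG> <CHECKPOINT> <NUM_GPUS> [extra args forwarded to test.py]
+# PORT can be overridden via the environment (defaults to 29503).
+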
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29503}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox
diff --git a/adzoo/vad/dist_train.sh b/adzoo/vad/dist_train.sh
new file mode 100755
index 0000000..141b284
--- /dev/null
+++ b/adzoo/vad/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
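+# Usage (a sketch): bash adzoo/vad/dist_train.sh <CONFIG> <NUM_GPUS> [extra args forwarded to train.py]
+# PORT can be overridden via the environment (defaults to 28509); --deterministic is always passed.
+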
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28509}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
diff --git a/adzoo/vad/misc/browse_dataset.py b/adzoo/vad/misc/browse_dataset.py
new file mode 100644
index 0000000..e3419f6
--- /dev/null
+++ b/adzoo/vad/misc/browse_dataset.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import numpy as np
+import warnings
+from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress
+from os import path as osp
+
+from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
+ DepthInstance3DBoxes, LiDARInstance3DBoxes)
+from mmdet3d.core.visualizer import (show_multi_modality_result, show_result,
+ show_seg_result)
+from mmdet3d.datasets import build_dataset
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Browse a dataset')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument(
+ '--skip-type',
+ type=str,
+ nargs='+',
+ default=['Normalize'],
+ help='skip some useless pipeline')
+ parser.add_argument(
+ '--output-dir',
+ default=None,
+ type=str,
+ help='If there is no display interface, you can save it')
+ parser.add_argument(
+ '--task',
+ type=str,
+ choices=['det', 'seg', 'multi_modality-det', 'mono-det'],
+ help='Determine the visualization method depending on the task.')
+ parser.add_argument(
+ '--online',
+ action='store_true',
+ help='Whether to perform online visualization. Note that you often '
+ 'need a monitor to do so.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ args = parser.parse_args()
+ return args
+
+
+def build_data_cfg(config_path, skip_type, cfg_options):
+ """Build data config for loading visualization data."""
+ cfg = Config.fromfile(config_path)
+ if cfg_options is not None:
+ cfg.merge_from_dict(cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+ # extract inner dataset of `RepeatDataset` as `cfg.data.train`
+ # so we don't need to worry about it later
+ if cfg.data.train['type'] == 'RepeatDataset':
+ cfg.data.train = cfg.data.train.dataset
+ # use only first dataset for `ConcatDataset`
+ if cfg.data.train['type'] == 'ConcatDataset':
+ cfg.data.train = cfg.data.train.datasets[0]
+ train_data_cfg = cfg.data.train
+ # eval_pipeline purely consists of loading functions
+ # use eval_pipeline for data loading
+ train_data_cfg['pipeline'] = [
+ x for x in cfg.eval_pipeline if x['type'] not in skip_type
+ ]
+
+ return cfg
+
+
+def to_depth_mode(points, bboxes):
+ """Convert points and bboxes to Depth Coord and Depth Box mode."""
+ if points is not None:
+ points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
+ Coord3DMode.DEPTH)
+ if bboxes is not None:
+ bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
+ Box3DMode.DEPTH)
+ return points, bboxes
+
+
+def show_det_data(idx, dataset, out_dir, filename, show=False):
+ """Visualize 3D point cloud and 3D bboxes."""
+ example = dataset.prepare_train_data(idx)
+ points = example['points']._data.numpy()
+ gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor
+ if dataset.box_mode_3d != Box3DMode.DEPTH:
+ points, gt_bboxes = to_depth_mode(points, gt_bboxes)
+ show_result(
+ points,
+ gt_bboxes.clone(),
+ None,
+ out_dir,
+ filename,
+ show=show,
+ snapshot=True)
+
+
+def show_seg_data(idx, dataset, out_dir, filename, show=False):
+ """Visualize 3D point cloud and segmentation mask."""
+ example = dataset.prepare_train_data(idx)
+ points = example['points']._data.numpy()
+ gt_seg = example['pts_semantic_mask']._data.numpy()
+ show_seg_result(
+ points,
+ gt_seg.copy(),
+ None,
+ out_dir,
+ filename,
+ np.array(dataset.PALETTE),
+ dataset.ignore_index,
+ show=show,
+ snapshot=True)
+
+
+def show_proj_bbox_img(idx,
+ dataset,
+ out_dir,
+ filename,
+ show=False,
+ is_nus_mono=False):
+ """Visualize 3D bboxes on 2D image by projection."""
+ try:
+ example = dataset.prepare_train_data(idx)
+ except AttributeError: # for Mono-3D datasets
+ example = dataset.prepare_train_img(idx)
+ gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d']
+ img_metas = example['img_metas']._data
+ img = example['img']._data.numpy()
+    # transpose the channel dimension to the last axis (CHW -> HWC)
+ img = img.transpose(1, 2, 0)
+ # no 3D gt bboxes, just show img
+ if gt_bboxes.tensor.shape[0] == 0:
+ gt_bboxes = None
+ if isinstance(gt_bboxes, DepthInstance3DBoxes):
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ None,
+ None,
+ out_dir,
+ filename,
+ box_mode='depth',
+ img_metas=img_metas,
+ show=show)
+ elif isinstance(gt_bboxes, LiDARInstance3DBoxes):
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ None,
+ img_metas['lidar2img'],
+ out_dir,
+ filename,
+ box_mode='lidar',
+ img_metas=img_metas,
+ show=show)
+ elif isinstance(gt_bboxes, CameraInstance3DBoxes):
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ None,
+ img_metas['cam2img'],
+ out_dir,
+ filename,
+ box_mode='camera',
+ img_metas=img_metas,
+ show=show)
+ else:
+ # can't project, just show img
+ warnings.warn(
+ f'unrecognized gt box type {type(gt_bboxes)}, only show image')
+ show_multi_modality_result(
+ img, None, None, None, out_dir, filename, show=show)
+
+
+def main():
+ args = parse_args()
+
+ if args.output_dir is not None:
+ mkdir_or_exist(args.output_dir)
+
+ cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options)
+ try:
+ dataset = build_dataset(
+ cfg.data.train, default_args=dict(filter_empty_gt=False))
+ except TypeError: # seg dataset doesn't have `filter_empty_gt` key
+ dataset = build_dataset(cfg.data.train)
+ data_infos = dataset.data_infos
+ dataset_type = cfg.dataset_type
+
+ # configure visualization mode
+ vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det'
+
+ for idx, data_info in enumerate(track_iter_progress(data_infos)):
+ if dataset_type in ['KittiDataset', 'WaymoDataset']:
+ data_path = data_info['point_cloud']['velodyne_path']
+ elif dataset_type in [
+ 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset',
+ 'S3DISSegDataset', 'S3DISDataset'
+ ]:
+ data_path = data_info['pts_path']
+ elif dataset_type in ['NuScenesDataset', 'LyftDataset']:
+ data_path = data_info['lidar_path']
+ elif dataset_type in ['NuScenesMonoDataset']:
+ data_path = data_info['file_name']
+ else:
+ raise NotImplementedError(
+ f'unsupported dataset type {dataset_type}')
+
+ file_name = osp.splitext(osp.basename(data_path))[0]
+
+ if vis_task in ['det', 'multi_modality-det']:
+ # show 3D bboxes on 3D point clouds
+ show_det_data(
+ idx, dataset, args.output_dir, file_name, show=args.online)
+ if vis_task in ['multi_modality-det', 'mono-det']:
+ # project 3D bboxes to 2D image
+ show_proj_bbox_img(
+ idx,
+ dataset,
+ args.output_dir,
+ file_name,
+ show=args.online,
+ is_nus_mono=(dataset_type == 'NuScenesMonoDataset'))
+ elif vis_task in ['seg']:
+ # show 3D segmentation mask on 3D point clouds
+ show_seg_data(
+ idx, dataset, args.output_dir, file_name, show=args.online)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/misc/fuse_conv_bn.py b/adzoo/vad/misc/fuse_conv_bn.py
new file mode 100644
index 0000000..d4e2201
--- /dev/null
+++ b/adzoo/vad/misc/fuse_conv_bn.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import torch
+from mmcv.runner import save_checkpoint
+from torch import nn as nn
+
+from mmdet.apis import init_model
+
+
+def fuse_conv_bn(conv, bn):
+    """During inference, batch norm layers stop updating their statistics; only
+    the per-channel running mean and variance are used. This makes it possible
+    to fuse them into the preceding conv layers to save computation and
+    simplify the network structure."""
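+    # Folding math (matches the code below):
+    #   BN(conv(x)) = gamma * (W * x + b - mean) / sqrt(var + eps) + beta
+    # which is again a convolution with
+    #   factor = gamma / sqrt(var + eps),  W' = W * factor,  b' = (b - mean) * factor + beta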
+ conv_w = conv.weight
+ conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+ bn.running_mean)
+
+ factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+ conv.weight = nn.Parameter(conv_w *
+ factor.reshape([conv.out_channels, 1, 1, 1]))
+ conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+ return conv
+
+
+def fuse_module(m):
+ last_conv = None
+ last_conv_name = None
+
+ for name, child in m.named_children():
+ if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
+ if last_conv is None: # only fuse BN that is after Conv
+ continue
+ fused_conv = fuse_conv_bn(last_conv, child)
+ m._modules[last_conv_name] = fused_conv
+ # To reduce changes, set BN as Identity instead of deleting it.
+ m._modules[name] = nn.Identity()
+ last_conv = None
+ elif isinstance(child, nn.Conv2d):
+ last_conv = child
+ last_conv_name = name
+ else:
+ fuse_module(child)
+ return m
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='fuse Conv and BN layers in a model')
+ parser.add_argument('config', help='config file path')
+ parser.add_argument('checkpoint', help='checkpoint file path')
+ parser.add_argument('out', help='output path of the converted model')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+ # build the model from a config file and a checkpoint file
+ model = init_model(args.config, args.checkpoint)
+ # fuse conv and bn layers of the model
+ fused_model = fuse_module(model)
+ save_checkpoint(fused_model, args.out)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/misc/print_config.py b/adzoo/vad/misc/print_config.py
new file mode 100644
index 0000000..3100fc3
--- /dev/null
+++ b/adzoo/vad/misc/print_config.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from mmcv import Config, DictAction
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Print the whole config')
+ parser.add_argument('config', help='config file path')
+ parser.add_argument(
+ '--options', nargs='+', action=DictAction, help='arguments in dict')
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ if args.options is not None:
+ cfg.merge_from_dict(args.options)
+ print(f'Config:\n{cfg.pretty_text}')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/misc/visualize_results.py b/adzoo/vad/misc/visualize_results.py
new file mode 100644
index 0000000..302adc5
--- /dev/null
+++ b/adzoo/vad/misc/visualize_results.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import mmcv
+from mmcv import Config
+
+from mmdet3d.datasets import build_dataset
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet3D visualize the results')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('--result', help='results file in pickle format')
+ parser.add_argument(
+ '--show-dir', help='directory where visualize results will be saved')
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ if args.result is not None and \
+ not args.result.endswith(('.pkl', '.pickle')):
+ raise ValueError('The results file must be a pkl file.')
+
+ cfg = Config.fromfile(args.config)
+ cfg.data.test.test_mode = True
+
+ # build the dataset
+ dataset = build_dataset(cfg.data.test)
+ results = mmcv.load(args.result)
+
+ if getattr(dataset, 'show', None) is not None:
+ # data loading pipeline for showing
+ eval_pipeline = cfg.get('eval_pipeline', {})
+ if eval_pipeline:
+ dataset.show(results, args.show_dir, pipeline=eval_pipeline)
+ else:
+ dataset.show(results, args.show_dir) # use default pipeline
+ else:
+ raise NotImplementedError(
+ 'Show is not implemented for dataset {}!'.format(
+ type(dataset).__name__))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/model_converters/convert_votenet_checkpoints.py b/adzoo/vad/model_converters/convert_votenet_checkpoints.py
new file mode 100644
index 0000000..33792b0
--- /dev/null
+++ b/adzoo/vad/model_converters/convert_votenet_checkpoints.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+import torch
+from mmcv import Config
+from mmcv.runner import load_state_dict
+
+from mmdet3d.models import build_detector
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet3D upgrade model version(before v0.6.0) of VoteNet')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument('--out', help='path of the output checkpoint file')
+ args = parser.parse_args()
+ return args
+
+
+def parse_config(config_strings):
+ """Parse config from strings.
+
+ Args:
+ config_strings (string): strings of model config.
+
+ Returns:
+ Config: model config
+ """
+ temp_file = tempfile.NamedTemporaryFile()
+ config_path = f'{temp_file.name}.py'
+ with open(config_path, 'w') as f:
+ f.write(config_strings)
+
+ config = Config.fromfile(config_path)
+
+ # Update backbone config
+ if 'pool_mod' in config.model.backbone:
+ config.model.backbone.pop('pool_mod')
+
+ if 'sa_cfg' not in config.model.backbone:
+ config.model.backbone['sa_cfg'] = dict(
+ type='PointSAModule',
+ pool_mod='max',
+ use_xyz=True,
+ normalize_xyz=True)
+
+ if 'type' not in config.model.bbox_head.vote_aggregation_cfg:
+ config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule'
+
+ # Update bbox_head config
+ if 'pred_layer_cfg' not in config.model.bbox_head:
+ config.model.bbox_head['pred_layer_cfg'] = dict(
+ in_channels=128, shared_conv_channels=(128, 128), bias=True)
+
+ if 'feat_channels' in config.model.bbox_head:
+ config.model.bbox_head.pop('feat_channels')
+
+ if 'vote_moudule_cfg' in config.model.bbox_head:
+ config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop(
+ 'vote_moudule_cfg')
+
+ if config.model.bbox_head.vote_aggregation_cfg.use_xyz:
+ config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3
+
+ temp_file.close()
+
+ return config
+
+
+def main():
+ """Convert keys in checkpoints for VoteNet.
+
+ There can be some breaking changes during the development of mmdetection3d,
+ and this tool is used for upgrading checkpoints trained with old versions
+ (before v0.6.0) to the latest one.
+ """
+ args = parse_args()
+ checkpoint = torch.load(args.checkpoint)
+ cfg = parse_config(checkpoint['meta']['config'])
+ # Build the model and load checkpoint
+ model = build_detector(
+ cfg.model,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+ orig_ckpt = checkpoint['state_dict']
+ converted_ckpt = orig_ckpt.copy()
+
+ if cfg['dataset_type'] == 'ScanNetDataset':
+ NUM_CLASSES = 18
+ elif cfg['dataset_type'] == 'SUNRGBDDataset':
+ NUM_CLASSES = 10
+ else:
+ raise NotImplementedError
+
+ RENAME_PREFIX = {
+ 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0',
+ 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1'
+ }
+
+ DEL_KEYS = [
+ 'bbox_head.conv_pred.0.bn.num_batches_tracked',
+ 'bbox_head.conv_pred.1.bn.num_batches_tracked'
+ ]
+
+ EXTRACT_KEYS = {
+ 'bbox_head.conv_pred.conv_cls.weight':
+ ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]),
+ 'bbox_head.conv_pred.conv_cls.bias':
+ ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]),
+ 'bbox_head.conv_pred.conv_reg.weight':
+ ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]),
+ 'bbox_head.conv_pred.conv_reg.bias':
+ ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)])
+ }
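+    # Each entry maps new_key -> (old_key, [(start, end), ...]); slices with
+    # end == -1 run to the end of the tensor, and the extracted pieces are
+    # concatenated along dim 0 before being stored under new_key.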
+
+ # Delete some useless keys
+ for key in DEL_KEYS:
+ converted_ckpt.pop(key)
+
+ # Rename keys with specific prefix
+ RENAME_KEYS = dict()
+ for old_key in converted_ckpt.keys():
+ for rename_prefix in RENAME_PREFIX.keys():
+ if rename_prefix in old_key:
+ new_key = old_key.replace(rename_prefix,
+ RENAME_PREFIX[rename_prefix])
+ RENAME_KEYS[new_key] = old_key
+ for new_key, old_key in RENAME_KEYS.items():
+ converted_ckpt[new_key] = converted_ckpt.pop(old_key)
+
+ # Extract weights and rename the keys
+ for new_key, (old_key, indices) in EXTRACT_KEYS.items():
+ cur_layers = orig_ckpt[old_key]
+ converted_layers = []
+ for (start, end) in indices:
+ if end != -1:
+ converted_layers.append(cur_layers[start:end])
+ else:
+ converted_layers.append(cur_layers[start:])
+ converted_layers = torch.cat(converted_layers, 0)
+ converted_ckpt[new_key] = converted_layers
+ if old_key in converted_ckpt.keys():
+ converted_ckpt.pop(old_key)
+
+ # Check the converted checkpoint by loading to the model
+ load_state_dict(model, converted_ckpt, strict=True)
+ checkpoint['state_dict'] = converted_ckpt
+ torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/model_converters/publish_model.py b/adzoo/vad/model_converters/publish_model.py
new file mode 100644
index 0000000..318fd46
--- /dev/null
+++ b/adzoo/vad/model_converters/publish_model.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+import torch
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Process a checkpoint to be published')
+ parser.add_argument('in_file', help='input checkpoint filename')
+ parser.add_argument('out_file', help='output checkpoint filename')
+ args = parser.parse_args()
+ return args
+
+
+def process_checkpoint(in_file, out_file):
+ checkpoint = torch.load(in_file, map_location='cpu')
+ # remove optimizer for smaller file size
+ if 'optimizer' in checkpoint:
+ del checkpoint['optimizer']
+ # if it is necessary to remove some sensitive data in checkpoint['meta'],
+ # add the code here.
+ torch.save(checkpoint, out_file)
+ sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    # str.rstrip('.pth') strips characters, not the suffix; slice it off instead
+    out_prefix = out_file[:-4] if out_file.endswith('.pth') else out_file
+    final_file = out_prefix + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+
+
+def main():
+ args = parse_args()
+ process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/model_converters/regnet2mmdet.py b/adzoo/vad/model_converters/regnet2mmdet.py
new file mode 100644
index 0000000..9dee3c8
--- /dev/null
+++ b/adzoo/vad/model_converters/regnet2mmdet.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import torch
+from collections import OrderedDict
+
+
+def convert_stem(model_key, model_weight, state_dict, converted_names):
+ new_key = model_key.replace('stem.conv', 'conv1')
+ new_key = new_key.replace('stem.bn', 'bn1')
+ state_dict[new_key] = model_weight
+ converted_names.add(model_key)
+ print(f'Convert {model_key} to {new_key}')
+
+
+def convert_head(model_key, model_weight, state_dict, converted_names):
+ new_key = model_key.replace('head.fc', 'fc')
+ state_dict[new_key] = model_weight
+ converted_names.add(model_key)
+ print(f'Convert {model_key} to {new_key}')
+
+
+def convert_reslayer(model_key, model_weight, state_dict, converted_names):
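+    # pycls RegNet keys look like 's<stage>.b<block>.<module>...' (e.g. a key such
+    # as 's1.b2.f.a.weight', an assumed example); they are remapped below to the
+    # mmdet-style 'layer<stage>.<block-1>.<conv/bn/downsample>...' naming.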
+ split_keys = model_key.split('.')
+ layer, block, module = split_keys[:3]
+ block_id = int(block[1:])
+ layer_name = f'layer{int(layer[1:])}'
+ block_name = f'{block_id - 1}'
+
+ if block_id == 1 and module == 'bn':
+ new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}'
+ elif block_id == 1 and module == 'proj':
+ new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}'
+ elif module == 'f':
+ if split_keys[3] == 'a_bn':
+ module_name = 'bn1'
+ elif split_keys[3] == 'b_bn':
+ module_name = 'bn2'
+ elif split_keys[3] == 'c_bn':
+ module_name = 'bn3'
+ elif split_keys[3] == 'a':
+ module_name = 'conv1'
+ elif split_keys[3] == 'b':
+ module_name = 'conv2'
+        elif split_keys[3] == 'c':
+            module_name = 'conv3'
+        else:
+            raise ValueError(f'Unsupported conversion of key {model_key}')
+        new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}'
+ else:
+ raise ValueError(f'Unsupported conversion of key {model_key}')
+ print(f'Convert {model_key} to {new_key}')
+ state_dict[new_key] = model_weight
+ converted_names.add(model_key)
+
+
+def convert(src, dst):
+ """Convert keys in pycls pretrained RegNet models to mmdet style."""
+    # load the pycls model checkpoint
+ regnet_model = torch.load(src)
+ blobs = regnet_model['model_state']
+ # convert to pytorch style
+ state_dict = OrderedDict()
+ converted_names = set()
+ for key, weight in blobs.items():
+ if 'stem' in key:
+ convert_stem(key, weight, state_dict, converted_names)
+ elif 'head' in key:
+ convert_head(key, weight, state_dict, converted_names)
+ elif key.startswith('s'):
+ convert_reslayer(key, weight, state_dict, converted_names)
+
+ # check if all layers are converted
+ for key in blobs:
+ if key not in converted_names:
+ print(f'not converted: {key}')
+ # save checkpoint
+ checkpoint = dict()
+ checkpoint['state_dict'] = state_dict
+ torch.save(checkpoint, dst)
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Convert model keys')
+ parser.add_argument('src', help='src detectron model path')
+ parser.add_argument('dst', help='save path')
+ args = parser.parse_args()
+ convert(args.src, args.dst)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/adzoo/vad/test.py b/adzoo/vad/test.py
new file mode 100644
index 0000000..1733443
--- /dev/null
+++ b/adzoo/vad/test.py
@@ -0,0 +1,277 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+import argparse
+import os
+import torch
+import warnings
+from mmcv.utils import get_dist_info, init_dist, wrap_fp16_model, set_random_seed, Config, DictAction, load_checkpoint
+from mmcv.models import build_model, fuse_conv_bn
+from torch.nn import DataParallel
+from torch.nn.parallel.distributed import DistributedDataParallel
+
+from mmcv.datasets import build_dataset, build_dataloader, replace_ImageToTensor
+import time
+import os.path as osp
+from adzoo.vad.apis.test import custom_multi_gpu_test, single_gpu_test
+
+warnings.filterwarnings("ignore")
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet test (and eval) a model')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--json_dir', help='parent directory for the metric json record')
+ parser.add_argument('--out', help='output result file in pickle format')
+ parser.add_argument(
+ '--fuse-conv-bn',
+ action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+ parser.add_argument(
+ '--format-only',
+ action='store_true',
+        help='Format the output results without performing evaluation. It is '
+        'useful when you want to format the result to a specific format and '
+ 'submit it to the test server')
+ parser.add_argument(
+ '--eval',
+ type=str,
+ nargs='+',
+ help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+ ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+ parser.add_argument('--show', action='store_true', help='show results')
+ parser.add_argument(
+ '--show-dir', help='directory where results will be saved')
+ parser.add_argument(
+ '--gpu-collect',
+ action='store_true',
+ help='whether to use gpu to collect results.')
+ parser.add_argument(
+ '--tmpdir',
+ help='tmp directory used for collecting results from multiple '
+ 'workers, available when gpu-collect is not specified')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ parser.add_argument(
+ '--options',
+ nargs='+',
+ action=DictAction,
+ help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function (deprecated), '
+ 'change to --eval-options instead.')
+ parser.add_argument(
+ '--eval-options',
+ nargs='+',
+ action=DictAction,
+ help='custom options for evaluation, the key-value pair in xxx=yyy '
+ 'format will be kwargs for dataset.evaluate() function')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local-rank', type=int, default=0)
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ if args.options and args.eval_options:
+ raise ValueError(
+ '--options and --eval-options cannot be both specified, '
+ '--options is deprecated in favor of --eval-options')
+ if args.options:
+ warnings.warn('--options is deprecated in favor of --eval-options')
+ args.eval_options = args.options
+ return args
+
+
+def main():
+ args = parse_args()
+
+ assert args.out or args.eval or args.format_only or args.show \
+ or args.show_dir, \
+ ('Please specify at least one operation (save/eval/format/show the '
+ 'results / save the results) with the argument "--out", "--eval"'
+ ', "--format-only", "--show" or "--show-dir"')
+
+ if args.eval and args.format_only:
+ raise ValueError('--eval and --format_only cannot be both specified')
+
+ if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+ raise ValueError('The output file must be a pkl file.')
+
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+
+ # import modules from plguin/xx, registry will be updated
+ # if hasattr(cfg, 'plugin'):
+ # if cfg.plugin:
+ # import importlib
+ # if hasattr(cfg, 'plugin_dir'):
+ # plugin_dir = cfg.plugin_dir
+ # _module_dir = os.path.dirname(plugin_dir)
+ # _module_dir = _module_dir.split('/')
+ # _module_path = _module_dir[0]
+
+ # for m in _module_dir[1:]:
+ # _module_path = _module_path + '.' + m
+ # print(_module_path)
+ # plg_lib = importlib.import_module(_module_path)
+ # else:
+ # # import dir is the dirpath for the config file
+ # _module_dir = os.path.dirname(args.config)
+ # _module_dir = _module_dir.split('/')
+ # _module_path = _module_dir[0]
+ # for m in _module_dir[1:]:
+ # _module_path = _module_path + '.' + m
+ # print(_module_path)
+ # plg_lib = importlib.import_module(_module_path)
+
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+
+ if cfg.get('close_tf32', False):
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+
+ cfg.model.pretrained = None
+ # in case the test dataset is concatenated
+ samples_per_gpu = 1
+ if isinstance(cfg.data.test, dict):
+ cfg.data.test.test_mode = True
+ samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+ if samples_per_gpu > 1:
+ # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+ cfg.data.test.pipeline = replace_ImageToTensor(
+ cfg.data.test.pipeline)
+ elif isinstance(cfg.data.test, list):
+ for ds_cfg in cfg.data.test:
+ ds_cfg.test_mode = True
+ samples_per_gpu = max(
+ [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+ if samples_per_gpu > 1:
+ for ds_cfg in cfg.data.test:
+ ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ else:
+ distributed = True
+ init_dist(args.launcher, **cfg.dist_params)
+
+ # set random seeds
+ if args.seed is not None:
+ set_random_seed(args.seed, deterministic=args.deterministic)
+
+ # build the dataloader
+ dataset = build_dataset(cfg.data.test)
+ data_loader = build_dataloader(
+ dataset,
+ samples_per_gpu=samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+ )
+
+ # build the model and load checkpoint
+ cfg.model.train_cfg = None
+ model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ wrap_fp16_model(model)
+ checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+ if args.fuse_conv_bn:
+ model = fuse_conv_bn(model)
+    # old versions did not save class info in checkpoints; this workaround is
+    # for backward compatibility
+ if 'CLASSES' in checkpoint.get('meta', {}):
+ model.CLASSES = checkpoint['meta']['CLASSES']
+ else:
+ model.CLASSES = dataset.CLASSES
+ # palette for visualization in segmentation tasks
+ if 'PALETTE' in checkpoint.get('meta', {}):
+ model.PALETTE = checkpoint['meta']['PALETTE']
+ elif hasattr(dataset, 'PALETTE'):
+ # segmentation dataset has `PALETTE` attribute
+ model.PALETTE = dataset.PALETTE
+
+ if not distributed:
+ model = DataParallel(model, device_ids=[0])
+ outputs = single_gpu_test(model, data_loader)
+ else:
+ model = DistributedDataParallel(
+ model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False)
+ outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir,
+ args.gpu_collect)
+
+
+
+ rank, _ = get_dist_info()
+ if rank == 0:
+ if args.out:
+ print(f'\nwriting results to {args.out}')
+ kwargs = {} if args.eval_options is None else args.eval_options
+ kwargs['jsonfile_prefix'] = osp.join('test', args.config.split(
+ '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_'))
+ if args.format_only:
+ dataset.format_results(outputs, **kwargs)
+
+ if args.eval:
+ eval_kwargs = cfg.get('evaluation', {}).copy()
+ # hard-code way to remove EvalHook args
+ for key in [
+ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
+ 'rule'
+ ]:
+ eval_kwargs.pop(key, None)
+ eval_kwargs.update(dict(metric=args.eval, **kwargs))
+
+ print(dataset.evaluate(outputs['bbox_results'], **eval_kwargs))
+
+ # # # NOTE: record to json
+ # json_path = args.json_dir
+ # if not os.path.exists(json_path):
+ # os.makedirs(json_path)
+
+ # metric_all = []
+ # for res in outputs['bbox_results']:
+ # for k in res['metric_results'].keys():
+ # if type(res['metric_results'][k]) is np.ndarray:
+ # res['metric_results'][k] = res['metric_results'][k].tolist()
+ # metric_all.append(res['metric_results'])
+
+ # print('start saving to json done')
+ # with open(json_path+'/metric_record.json', "w", encoding="utf-8") as f2:
+ # json.dump(metric_all, f2, indent=4)
+ # print('save to json done')
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/adzoo/vad/train.py b/adzoo/vad/train.py
new file mode 100644
index 0000000..d880d4f
--- /dev/null
+++ b/adzoo/vad/train.py
@@ -0,0 +1,237 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+# Modified by Zhiqi Li
+# ---------------------------------------------
+
+from __future__ import division
+
+import argparse
+import copy
+import mmcv
+import os
+import time
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.utils import get_dist_info, init_dist
+from os import path as osp
+
+
+from mmcv.datasets import build_dataset
+from mmcv.models import build_model
+from mmcv.utils import collect_env, get_root_logger
+from mmcv.utils import set_random_seed
+
+from mmcv.utils import TORCH_VERSION, digit_version
+from adzoo.bevformer.mmdet3d_plugin.bevformer.apis.train import custom_train_model
+
+import cv2
+cv2.setNumThreads(1)
+
+import sys
+sys.path.append('')
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Train a detector')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument('--work-dir', help='the dir to save logs and models')
+ parser.add_argument(
+ '--resume-from', help='the checkpoint file to resume from')
+ parser.add_argument(
+ '--no-validate',
+ action='store_true',
+ help='whether not to evaluate the checkpoint during training')
+ group_gpus = parser.add_mutually_exclusive_group()
+ group_gpus.add_argument(
+ '--gpus',
+ type=int,
+ help='number of gpus to use '
+ '(only applicable to non-distributed training)')
+ group_gpus.add_argument(
+ '--gpu-ids',
+ type=int,
+ nargs='+',
+ help='ids of gpus to use '
+ '(only applicable to non-distributed training)')
+ parser.add_argument('--seed', type=int, default=0, help='random seed')
+ parser.add_argument(
+ '--deterministic',
+ action='store_true',
+ help='whether to set deterministic options for CUDNN backend.')
+ parser.add_argument(
+ '--options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file (deprecated), '
+ 'change to --cfg-options instead.')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ parser.add_argument(
+ '--launcher',
+ choices=['none', 'pytorch', 'slurm', 'mpi'],
+ default='none',
+ help='job launcher')
+ parser.add_argument('--local-rank', type=int, default=0)
+ parser.add_argument(
+ '--autoscale-lr',
+ action='store_true',
+ help='automatically scale lr with the number of gpus')
+ args = parser.parse_args()
+ if 'LOCAL_RANK' not in os.environ:
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+ if args.options and args.cfg_options:
+ raise ValueError(
+ '--options and --cfg-options cannot be both specified, '
+ '--options is deprecated in favor of --cfg-options')
+ if args.options:
+ warnings.warn('--options is deprecated in favor of --cfg-options')
+ args.cfg_options = args.options
+
+ return args
+
+
+def main():
+ args = parse_args()
+
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+ # import modules from string list.
+ if cfg.get('custom_imports', None):
+ from mmcv.utils import import_modules_from_strings
+ import_modules_from_strings(**cfg['custom_imports'])
+
+ # set cudnn_benchmark
+ if cfg.get('cudnn_benchmark', False):
+ torch.backends.cudnn.benchmark = True
+ # set tf32
+ if cfg.get('close_tf32', False):
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+ # work_dir is determined in this priority: CLI > segment in file > filename
+ if args.work_dir is not None:
+ # update configs according to CLI args if args.work_dir is not None
+ cfg.work_dir = args.work_dir
+ elif cfg.get('work_dir', None) is None:
+ # use config filename as default work_dir if cfg.work_dir is None
+ cfg.work_dir = osp.join('./work_dirs',
+ osp.splitext(osp.basename(args.config))[0])
+ # if args.resume_from is not None:
+ if args.resume_from is not None and osp.isfile(args.resume_from):
+ cfg.resume_from = args.resume_from
+ if args.gpu_ids is not None:
+ cfg.gpu_ids = args.gpu_ids
+ else:
+ cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+ if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW':
+        cfg.optimizer['type'] = 'AdamW2'  # work around the AdamW bug in torch 1.8.1
+ if args.autoscale_lr:
+ # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+ cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+ # init distributed env first, since logger depends on the dist info.
+ if args.launcher == 'none':
+ distributed = False
+ else:
+ distributed = True
+ init_dist(args.launcher, **cfg.dist_params)
+ # re-set gpu_ids with distributed training mode
+ _, world_size = get_dist_info()
+ cfg.gpu_ids = range(world_size)
+
+ # create work_dir
+ mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+ # dump config
+ cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+ # init the logger before other steps
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+ log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+ # specify logger name, if we still use 'mmdet', the output info will be
+ # filtered and won't be saved in the log_file
+ # TODO: ugly workaround to judge whether we are training det or seg model
+ if cfg.model.type in ['EncoderDecoder3D']:
+ logger_name = 'mmseg'
+ else:
+ logger_name = 'mmdet'
+ logger = get_root_logger(
+ log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+ # init the meta dict to record some important information such as
+ # environment info and seed, which will be logged
+ meta = dict()
+ # log env info
+ env_info_dict = collect_env()
+ env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+ dash_line = '-' * 60 + '\n'
+ logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+ dash_line)
+ meta['env_info'] = env_info
+ meta['config'] = cfg.pretty_text
+
+ # log some basic info
+ logger.info(f'Distributed training: {distributed}')
+ logger.info(f'Config:\n{cfg.pretty_text}')
+
+ # set random seeds
+ if args.seed is not None:
+ logger.info(f'Set random seed to {args.seed}, '
+ f'deterministic: {args.deterministic}')
+ set_random_seed(args.seed, deterministic=args.deterministic)
+ cfg.seed = args.seed
+ meta['seed'] = args.seed
+ meta['exp_name'] = osp.basename(args.config)
+
+ model = build_model(
+ cfg.model,
+ train_cfg=cfg.get('train_cfg'),
+ test_cfg=cfg.get('test_cfg'))
+ model.init_weights()
+
+ logger.info(f'Model:\n{model}')
+ datasets = [build_dataset(cfg.data.train)]
+ if len(cfg.workflow) == 2:
+ val_dataset = copy.deepcopy(cfg.data.val)
+ # in case we use a dataset wrapper
+ if 'dataset' in cfg.data.train:
+ val_dataset.pipeline = cfg.data.train.dataset.pipeline
+ else:
+ val_dataset.pipeline = cfg.data.train.pipeline
+            # set test_mode=False here in the deep-copied config,
+            # which does not affect the AP/AR calculation later
+ # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
+ val_dataset.test_mode = False
+ datasets.append(build_dataset(val_dataset))
+ if cfg.checkpoint_config is not None:
+ # save mmdet version, config file content and class names in
+ # checkpoints as meta data
+ cfg.checkpoint_config.meta = dict(
+ config=cfg.pretty_text,
+ CLASSES=datasets[0].CLASSES,
+ PALETTE=datasets[0].PALETTE # for segmentors
+ if hasattr(datasets[0], 'PALETTE') else None)
+ # add an attribute for visualization convenience
+ model.CLASSES = datasets[0].CLASSES
+ custom_train_model(
+ model,
+ datasets,
+ cfg,
+ distributed=distributed,
+ validate=(not args.no_validate),
+ timestamp=timestamp,
+ meta=meta)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/asserts/bench2drive.jpg b/asserts/bench2drive.jpg
new file mode 100644
index 0000000..2a8aad2
Binary files /dev/null and b/asserts/bench2drive.jpg differ
diff --git a/asserts/bench2drivezoo.png b/asserts/bench2drivezoo.png
new file mode 100644
index 0000000..f8b1b2a
Binary files /dev/null and b/asserts/bench2drivezoo.png differ
diff --git a/clear.py b/clear.py
new file mode 100644
index 0000000..c12b90f
--- /dev/null
+++ b/clear.py
@@ -0,0 +1,55 @@
+import os
+import ast
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def find_py_files(root_dir):
+ py_files = []
+ for root, dirs, files in os.walk(root_dir):
+ for file in files:
+ if file.endswith('.py'):
+ py_files.append(os.path.join(root, file))
+ return py_files
+
+def analyze_file(file_path):
+ with open(file_path, "r", encoding='utf-8') as file:
+ file_content = file.read()
+ tree = ast.parse(file_content)
+
+ classes = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
+ functions = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]
+ return file_path, classes, functions
+
+def analyze_projects(root_dir):
+ py_files = find_py_files(root_dir)
+ results = []
+
+ with ThreadPoolExecutor(max_workers=4) as executor:
+ future_to_file = {executor.submit(analyze_file, file_path): file_path for file_path in py_files}
+ for future in as_completed(future_to_file):
+ file_path = future_to_file[future]
+ try:
+ results.append(future.result())
+ except Exception as exc:
+ print(f'{file_path} generated an exception: {exc}')
+ return results
+
+root_dir = "./"
+results = analyze_projects(root_dir)
+
+result_dict = {}
+
+for file_path, classes, functions in results:
+    print(f"File: {file_path}")
+    print(f"Classes: {classes}")
+    print(f"Functions: {functions}")
+    # lists are unhashable, so store them as values rather than dict keys
+    result_dict[file_path] = {
+        'classes': classes,
+        'functions': functions,
+        'count': 0,
+    }
+
+import json
+
+with open("myfile.json", "w", encoding="utf-8") as out_file:
+    json.dump(result_dict, out_file, indent=4)
\ No newline at end of file
diff --git a/data/others/b2d_motion_anchor_infos_mode6.pkl b/data/others/b2d_motion_anchor_infos_mode6.pkl
new file mode 100644
index 0000000..82465a7
Binary files /dev/null and b/data/others/b2d_motion_anchor_infos_mode6.pkl differ
diff --git a/data/splits/bench2drive_base_train_val_split.json b/data/splits/bench2drive_base_train_val_split.json
new file mode 100644
index 0000000..855d8ed
--- /dev/null
+++ b/data/splits/bench2drive_base_train_val_split.json
@@ -0,0 +1 @@
+{"train": ["v1/PedestrianCrossing_Town13_Route638_Weather14", "v1/BlockedIntersection_Town13_Route616_Weather18", "v1/LaneChange_Town13_Route725_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route603_Weather15", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route507_Weather13", "v1/ParkedObstacleTwoWays_Town12_Route1166_Weather22", "v1/StaticCutIn_Town15_Route429_Weather13", "v1/ParkingCutIn_Town12_Route1301_Weather14", "v1/StaticCutIn_Town03_Route158_Weather2", "v1/YieldToEmergencyVehicle_Town04_Route207_Weather25", "v1/Accident_Town12_Route956_Weather20", "v1/ParkingCutIn_Town12_Route1313_Weather3", "v1/HighwayExit_Town12_Route1324_Weather0", "v1/CrossingBicycleFlow_Town12_Route1050_Weather10", "v1/ParkingCutIn_Town12_Route762_Weather8", "v1/ParkedObstacle_Town15_Route415_Weather25", "v1/BlockedIntersection_Town15_Route486_Weather18", "v1/TJunction_Town13_Route655_Weather5", "v1/ParkedObstacleTwoWays_Town12_Route1167_Weather23", "v1/ParkingCutIn_Town13_Route1349_Weather10", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route885_Weather14", "v1/OppositeVehicleRunningRedLight_Town12_Route809_Weather3", "v1/ParkedObstacle_Town05_Route262_Weather1", "v1/VehicleTurningRoutePedestrian_Town13_Route680_Weather10", "v1/HazardAtSideLaneTwoWays_Town12_Route1128_Weather10", "v1/NonSignalizedJunctionLeftTurn_Town03_Route153_Weather26", "v1/SignalizedJunctionLeftTurn_Town10HD_Route380_Weather21", "v1/HighwayExit_Town06_Route292_Weather14", "v1/VanillaSignalizedTurnEncounterGreenLight_Town04_Route196_Weather14", "v1/HazardAtSideLane_Town12_Route1512_Weather7", "v1/ParkingCrossingPedestrian_Town12_Route758_Weather3", "v1/HazardAtSideLaneTwoWays_Town12_Route1140_Weather22", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route511_Weather9", "v1/HazardAtSideLaneTwoWays_Town12_Route1129_Weather11", "v1/VehicleTurningRoute_Town13_Route679_Weather3", "v1/InvadingTurn_Town13_Route575_Weather3", "v1/OppositeVehicleRunningRedLight_Town15_Route440_Weather23", "v1/NonSignalizedJunctionRightTurn_Town12_Route1024_Weather10", "v1/HazardAtSideLaneTwoWays_Town12_Route1133_Weather15", "v1/MergerIntoSlowTrafficV2_Town12_Route941_Weather5", "v1/VehicleTurningRoutePedestrian_Town12_Route826_Weather20", "v1/ConstructionObstacle_Town03_Route61_Weather9", "v1/ConstructionObstacleTwoWays_Town12_Route1098_Weather6", "v1/MergerIntoSlowTrafficV2_Town12_Route858_Weather0", "v1/HardBreakRoute_Town02_Route34_Weather8", "v1/LaneChange_Town13_Route743_Weather3", "v1/NonSignalizedJunctionLeftTurn_Town12_Route812_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route874_Weather8", "v1/YieldToEmergencyVehicle_Town13_Route560_Weather14", "v1/ParkingExit_Town13_Route567_Weather21", "v1/HazardAtSideLane_Town15_Route420_Weather3", "v1/ParkedObstacleTwoWays_Town13_Route1334_Weather26", "v1/HighwayExit_Town12_Route841_Weather9", "v1/TJunction_Town12_Route926_Weather8", "v1/HighwayExit_Town12_Route1000_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town10HD_Route388_Weather23", "v1/HighwayExit_Town13_Route705_Weather3", "v1/StaticCutIn_Town15_Route427_Weather11", "v1/StaticCutIn_Town13_Route563_Weather9", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route646_Weather22", "v1/AccidentTwoWays_Town12_Route1114_Weather22", "v1/OppositeVehicleRunningRedLight_Town05_Route268_Weather8", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route887_Weather3", "v1/HazardAtSideLane_Town12_Route1535_Weather10", "v1/ParkingCutIn_Town12_Route1304_Weather9", "v1/HighwayExit_Town12_Route937_Weather1", 
"v1/AccidentTwoWays_Town12_Route1110_Weather18", "v1/InterurbanAdvancedActorFlow_Town13_Route686_Weather10", "v1/HazardAtSideLane_Town03_Route105_Weather22", "v1/ParkingCrossingPedestrian_Town12_Route760_Weather6", "v1/OppositeVehicleTakingPriority_Town12_Route995_Weather7", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route661_Weather11", "v1/HardBreakRoute_Town06_Route46_Weather20", "v1/OppositeVehicleTakingPriority_Town04_Route189_Weather7", "v1/BlockedIntersection_Town07_Route352_Weather14", "v1/ConstructionObstacleTwoWays_Town12_Route1404_Weather26", "v1/AccidentTwoWays_Town12_Route1446_Weather2", "v1/ParkedObstacle_Town03_Route147_Weather0", "v1/HazardAtSideLaneTwoWays_Town12_Route1146_Weather2", "v1/VanillaSignalizedTurnEncounterGreenLight_Town04_Route197_Weather15", "v1/AccidentTwoWays_Town12_Route1456_Weather15", "v1/ParkingCutIn_Town12_Route955_Weather19", "v1/ParkedObstacle_Town13_Route553_Weather11", "v1/VanillaSignalizedTurnEncounterRedLight_Town03_Route141_Weather11", "v1/NonSignalizedJunctionRightTurn_Town12_Route817_Weather11", "v1/OppositeVehicleTakingPriority_Town03_Route128_Weather23", "v1/HighwayExit_Town13_Route749_Weather21", "v1/VanillaSignalizedTurnEncounterRedLight_Town03_Route142_Weather12", "v1/TJunction_Town07_Route364_Weather0", "v1/HardBreakRoute_Town07_Route47_Weather21", "v1/CrossingBicycleFlow_Town12_Route1062_Weather22", "v1/HazardAtSideLane_Town12_Route1527_Weather25", "v1/LaneChange_Town12_Route756_Weather2", "v1/VehicleTurningRoutePedestrian_Town13_Route703_Weather1", "v1/OppositeVehicleRunningRedLight_Town04_Route178_Weather22", "v1/OppositeVehicleTakingPriority_Town12_Route820_Weather14", "v1/Accident_Town12_Route769_Weather15", "v1/AccidentTwoWays_Town12_Route1469_Weather3", "v1/MergerIntoSlowTrafficV2_Town12_Route1010_Weather22", "v1/NonSignalizedJunctionLeftTurn_Town12_Route966_Weather3", "v1/TJunction_Town12_Route883_Weather25", "v1/OppositeVehicleRunningRedLight_Town12_Route807_Weather1", "v1/OppositeVehicleTakingPriority_Town12_Route994_Weather6", "v1/CrossingBicycleFlow_Town12_Route1077_Weather11", "v1/InvadingTurn_Town02_Route99_Weather21", "v1/YieldToEmergencyVehicle_Town12_Route917_Weather7", "v1/PedestrianCrossing_Town13_Route718_Weather8", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route640_Weather8", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route536_Weather8", "v1/HazardAtSideLane_Town15_Route421_Weather5", "v1/TJunction_Town12_Route1018_Weather3", "v1/ConstructionObstacle_Town05_Route69_Weather9", "v1/HardBreakRoute_Town13_Route1341_Weather26", "v1/NonSignalizedJunctionRightTurn_Town12_Route816_Weather15", "v1/LaneChange_Town12_Route984_Weather22", "v1/YieldToEmergencyVehicle_Town12_Route779_Weather25", "v1/Accident_Town03_Route102_Weather20", "v1/VehicleTurningRoute_Town15_Route480_Weather18", "v1/OppositeVehicleTakingPriority_Town04_Route188_Weather6", "v1/ParkingCutIn_Town13_Route1347_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route1019_Weather5", "v1/HardBreakRoute_Town15_Route59_Weather7", "v1/LaneChange_Town06_Route326_Weather3", "v1/EnterActorFlow_Town13_Route612_Weather14", "v1/DynamicObjectCrossing_Town12_Route21_Weather21", "v1/HardBreakRoute_Town10HD_Route49_Weather23", "v1/StaticCutIn_Town06_Route287_Weather1", "v1/HighwayExit_Town13_Route619_Weather21", "v1/InterurbanAdvancedActorFlow_Town12_Route1048_Weather8", "v1/SignalizedJunctionRightTurn_Town04_Route176_Weather20", "v1/EnterActorFlow_Town07_Route349_Weather11", "v1/CrossingBicycleFlow_Town12_Route860_Weather2", 
"v1/ConstructionObstacleTwoWays_Town12_Route1419_Weather26", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town03_Route143_Weather13", "v1/ParkedObstacle_Town03_Route103_Weather25", "v1/ParkingExit_Town13_Route569_Weather23", "v1/AccidentTwoWays_Town12_Route1454_Weather13", "v1/NonSignalizedJunctionRightTurn_Town13_Route595_Weather11", "v1/SignalizedJunctionRightTurn_Town07_Route339_Weather1", "v1/HazardAtSideLane_Town12_Route1506_Weather0", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route490_Weather22", "v1/ConstructionObstacle_Town13_Route82_Weather3", "v1/DynamicObjectCrossing_Town02_Route12_Weather12", "v1/HighwayExit_Town12_Route1051_Weather11", "v1/MergerIntoSlowTraffic_Town13_Route627_Weather3", "v1/YieldToEmergencyVehicle_Town05_Route225_Weather9", "v1/ControlLoss_Town15_Route430_Weather14", "v1/ParkingCutIn_Town13_Route546_Weather0", "v1/VanillaSignalizedTurnEncounterRedLight_Town15_Route452_Weather10", "v1/StaticCutIn_Town15_Route428_Weather12", "v1/ParkedObstacleTwoWays_Town12_Route1183_Weather7", "v1/PedestrianCrossing_Town15_Route448_Weather6", "v1/BlockedIntersection_Town04_Route193_Weather11", "v1/InterurbanAdvancedActorFlow_Town06_Route325_Weather13", "v1/ParkedObstacle_Town12_Route771_Weather9", "v1/AccidentTwoWays_Town12_Route1104_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town03_Route140_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route493_Weather25", "v1/TJunction_Town12_Route1201_Weather5", "v1/ParkedObstacle_Town04_Route162_Weather6", "v1/ParkingCutIn_Town13_Route670_Weather20", "v1/VehicleTurningRoute_Town13_Route605_Weather9", "v1/HazardAtSideLane_Town12_Route960_Weather20", "v1/StaticCutIn_Town04_Route168_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town10HD_Route393_Weather3", "v1/ParkingCutIn_Town12_Route764_Weather10", "v1/InterurbanAdvancedActorFlow_Town12_Route854_Weather22", "v1/HighwayExit_Town12_Route1028_Weather14", "v1/Accident_Town15_Route414_Weather23", "v1/ConstructionObstacleTwoWays_Town12_Route1425_Weather26", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route737_Weather23", "v1/ParkedObstacleTwoWays_Town12_Route1182_Weather12", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1356_Weather7", "v1/NonSignalizedJunctionLeftTurn_Town05_Route238_Weather26", "v1/HighwayCutIn_Town12_Route850_Weather18", "v1/HighwayCutIn_Town06_Route320_Weather8", "v1/HazardAtSideLaneTwoWays_Town12_Route1145_Weather1", "v1/ParkedObstacleTwoWays_Town12_Route1164_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route813_Weather26", "v1/HardBreakRoute_Town02_Route33_Weather7", "v1/ParkingExit_Town12_Route1305_Weather18", "v1/HighwayCutIn_Town12_Route974_Weather12", "v1/NonSignalizedJunctionLeftTurn_Town07_Route344_Weather6", "v1/ConstructionObstacleTwoWays_Town12_Route1084_Weather5", "v1/ParkingCutIn_Town13_Route671_Weather21", "v1/ConstructionObstacle_Town03_Route63_Weather11", "v1/EnterActorFlow_Town05_Route245_Weather9", "v1/VanillaSignalizedTurnEncounterRedLight_Town10HD_Route389_Weather25", "v1/SignalizedJunctionLeftTurn_Town15_Route471_Weather26", "v1/ConstructionObstacleTwoWays_Town12_Route1083_Weather9", "v1/HighwayCutIn_Town13_Route685_Weather9", "v1/HazardAtSideLaneTwoWays_Town12_Route1139_Weather21", "v1/CrossingBicycleFlow_Town12_Route1075_Weather9", "v1/HighwayCutIn_Town12_Route1006_Weather18", "v1/PedestrianCrossing_Town13_Route687_Weather11", "v1/ParkingExit_Town13_Route697_Weather21", "v1/ParkingExit_Town12_Route1309_Weather22", "v1/InterurbanAdvancedActorFlow_Town13_Route715_Weather13", 
"v1/ParkingCrossingPedestrian_Town12_Route896_Weather12", "v1/HazardAtSideLane_Town05_Route263_Weather3", "v1/TJunction_Town12_Route980_Weather18", "v1/OppositeVehicleTakingPriority_Town12_Route1025_Weather11", "v1/EnterActorFlow_Town13_Route614_Weather8", "v1/ParkingCrossingPedestrian_Town13_Route668_Weather18", "v1/ParkingExit_Town13_Route731_Weather3", "v1/ParkingCrossingPedestrian_Town13_Route669_Weather19", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route723_Weather21", "v1/HazardAtSideLane_Town12_Route1519_Weather8", "v1/HighwayCutIn_Town13_Route631_Weather7", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route719_Weather18", "v1/ConstructionObstacleTwoWays_Town12_Route1421_Weather26", "v1/VehicleTurningRoutePedestrian_Town12_Route999_Weather11", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route1022_Weather8", "v1/ControlLoss_Town15_Route432_Weather8", "v1/CrossingBicycleFlow_Town12_Route1063_Weather23", "v1/ParkingExit_Town12_Route1308_Weather21", "v1/Accident_Town13_Route552_Weather6", "v1/EnterActorFlow_Town13_Route613_Weather15", "v1/LaneChange_Town13_Route726_Weather21", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1633_Weather12", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1629_Weather6", "v1/NonSignalizedJunctionRightTurn_Town12_Route967_Weather25", "v1/CrossingBicycleFlow_Town12_Route1078_Weather12", "v1/SignalizedJunctionLeftTurn_Town13_Route581_Weather26", "v1/VehicleTurningRoute_Town12_Route1026_Weather12", "v1/ParkedObstacleTwoWays_Town12_Route1181_Weather11", "v1/VehicleTurningRoute_Town15_Route1370_Weather7", "v1/ParkingCrossingPedestrian_Town12_Route953_Weather9", "v1/VehicleTurningRoutePedestrian_Town13_Route702_Weather14", "v1/ParkingCutIn_Town13_Route547_Weather1", "v1/HardBreakRoute_Town07_Route48_Weather22", "v1/ConstructionObstacle_Town13_Route81_Weather3", "v1/YieldToEmergencyVehicle_Town12_Route778_Weather14", "v1/DynamicObjectCrossing_Town01_Route2_Weather2", "v1/HazardAtSideLaneTwoWays_Town12_Route1141_Weather23", "v1/HighwayCutIn_Town13_Route745_Weather9", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route658_Weather8", "v1/ConstructionObstacleTwoWays_Town12_Route1085_Weather6", "v1/DynamicObjectCrossing_Town15_Route29_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town04_Route202_Weather20", "v1/ParkingCutIn_Town12_Route1312_Weather2", "v1/ParkedObstacle_Town05_Route221_Weather13", "v1/ParkingExit_Town12_Route788_Weather8", "v1/HazardAtSideLane_Town12_Route1526_Weather23", "v1/AccidentTwoWays_Town12_Route1106_Weather14", "v1/BlockedIntersection_Town12_Route936_Weather0", "v1/AccidentTwoWays_Town12_Route1459_Weather18", "v1/ParkingCutIn_Town13_Route549_Weather3", "v1/HighwayCutIn_Town12_Route940_Weather3", "v1/HardBreakRoute_Town05_Route43_Weather9", "v1/InvadingTurn_Town13_Route578_Weather6", "v1/VehicleOpensDoorTwoWays_Town12_Route1197_Weather1", "v1/VehicleTurningRoutePedestrian_Town13_Route610_Weather12", "v1/HighwayCutIn_Town13_Route628_Weather3", "v1/ParkingExit_Town12_Route920_Weather10", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1627_Weather3", "v1/ParkingCrossingPedestrian_Town12_Route759_Weather5", "v1/HardBreakRoute_Town03_Route38_Weather12", "v1/SignalizedJunctionLeftTurn_Town15_Route437_Weather26", "v1/HazardAtSideLaneTwoWays_Town12_Route1152_Weather8", "v1/ConstructionObstacleTwoWays_Town12_Route1424_Weather26", "v1/ConstructionObstacleTwoWays_Town12_Route1095_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town03_Route145_Weather15", 
"v1/VehicleTurningRoute_Town15_Route1380_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town07_Route355_Weather9", "v1/ControlLoss_Town15_Route433_Weather9", "v1/NonSignalizedJunctionLeftTurn_Town13_Route594_Weather26", "v1/Accident_Town12_Route766_Weather12", "v1/SignalizedJunctionLeftTurn_Town13_Route580_Weather26", "v1/HighwayCutIn_Town06_Route300_Weather14", "v1/HazardAtSideLane_Town12_Route1515_Weather12", "v1/StaticCutIn_Town03_Route109_Weather1", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route1034_Weather20", "v1/HardBreakRoute_Town04_Route39_Weather13", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1636_Weather15", "v1/OppositeVehicleRunningRedLight_Town03_Route119_Weather12", "v1/OppositeVehicleRunningRedLight_Town12_Route991_Weather3", "v1/VehicleTurningRoute_Town13_Route700_Weather23", "v1/HazardAtSideLaneTwoWays_Town12_Route1135_Weather9", "v1/StaticCutIn_Town12_Route783_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route946_Weather10", "v1/AccidentTwoWays_Town12_Route1445_Weather1", "v1/BlockedIntersection_Town04_Route195_Weather13", "v1/ParkingCutIn_Town12_Route903_Weather20", "v1/HardBreakRoute_Town05_Route44_Weather18", "v1/VehicleTurningRoute_Town12_Route825_Weather21", "v1/HazardAtSideLane_Town12_Route775_Weather12", "v1/ParkingCutIn_Town13_Route548_Weather2", "v1/CrossingBicycleFlow_Town12_Route863_Weather5", "v1/HazardAtSideLane_Town06_Route283_Weather23", "v1/BlockedIntersection_Town05_Route248_Weather14", "v1/BlockedIntersection_Town07_Route351_Weather13", "v1/AccidentTwoWays_Town12_Route1444_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town05_Route240_Weather26", "v1/YieldToEmergencyVehicle_Town12_Route781_Weather8", "v1/HazardAtSideLane_Town13_Route558_Weather12", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route877_Weather19", "v1/NonSignalizedJunctionRightTurn_Town12_Route931_Weather21", "v1/ParkedObstacle_Town12_Route773_Weather19", "v1/HighwayCutIn_Town12_Route1042_Weather2", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route1020_Weather6", "v1/SignalizedJunctionRightTurn_Town12_Route804_Weather5", "v1/CrossingBicycleFlow_Town12_Route1066_Weather0", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route451_Weather9", "v1/ParkedObstacleTwoWays_Town12_Route1172_Weather2", "v1/StaticCutIn_Town05_Route227_Weather2", "v1/YieldToEmergencyVehicle_Town13_Route675_Weather25", "v1/DynamicObjectCrossing_Town12_Route22_Weather22", "v1/NonSignalizedJunctionRightTurn_Town13_Route596_Weather23", "v1/ParkedObstacle_Town10HD_Route371_Weather7", "v1/HazardAtSideLaneTwoWays_Town12_Route1155_Weather11", "v1/VehicleTurningRoute_Town15_Route519_Weather25", "v1/Accident_Town06_Route280_Weather11", "v1/AccidentTwoWays_Town12_Route1468_Weather2", "v1/InterurbanAdvancedActorFlow_Town06_Route330_Weather18", "v1/PedestrianCrossing_Town12_Route865_Weather7", "v1/ParkingCutIn_Town13_Route1348_Weather7", "v1/TJunction_Town13_Route654_Weather3", "v1/VehicleTurningRoutePedestrian_Town12_Route1040_Weather0", "v1/InterurbanAdvancedActorFlow_Town13_Route753_Weather25", "v1/HazardAtSideLane_Town12_Route1507_Weather1", "v1/OppositeVehicleRunningRedLight_Town03_Route120_Weather8", "v1/StaticCutIn_Town03_Route149_Weather19", "v1/OppositeVehicleRunningRedLight_Town13_Route587_Weather15", "v1/OppositeVehicleTakingPriority_Town12_Route932_Weather22", "v1/VanillaSignalizedTurnEncounterGreenLight_Town07_Route356_Weather18", "v1/PedestrianCrossing_Town13_Route637_Weather13", "v1/ParkedObstacle_Town13_Route555_Weather9", 
"v1/StaticCutIn_Town04_Route208_Weather0", "v1/LaneChange_Town06_Route277_Weather9", "v1/YieldToEmergencyVehicle_Town04_Route165_Weather7", "v1/EnterActorFlow_Town05_Route271_Weather11", "v1/ParkingCrossingPedestrian_Town13_Route728_Weather0", "v1/InvadingTurn_Town15_Route436_Weather20", "v1/AccidentTwoWays_Town12_Route1458_Weather9", "v1/ParkingExit_Town12_Route786_Weather6", "v1/CrossingBicycleFlow_Town12_Route1065_Weather25", "v1/ParkedObstacleTwoWays_Town12_Route1163_Weather19", "v1/OppositeVehicleRunningRedLight_Town15_Route475_Weather7", "v1/HighwayCutIn_Town06_Route321_Weather9", "v1/ParkingCrossingPedestrian_Town13_Route727_Weather25", "v1/TJunction_Town12_Route1017_Weather3", "v1/ParkingCutIn_Town12_Route1300_Weather13", "v1/HighwayExit_Town13_Route622_Weather23", "v1/Accident_Town12_Route1108_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town13_Route592_Weather26", "v1/DynamicObjectCrossing_Town01_Route8_Weather3", "v1/AccidentTwoWays_Town12_Route1119_Weather1", "v1/HardBreakRoute_Town12_Route53_Weather1", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route529_Weather9", "v1/VehicleTurningRoute_Town15_Route1376_Weather15", "v1/HazardAtSideLane_Town12_Route1537_Weather12", "v1/HazardAtSideLane_Town12_Route915_Weather7", "v1/ParkingExit_Town13_Route568_Weather22", "v1/DynamicObjectCrossing_Town15_Route28_Weather2", "v1/TJunction_Town02_Route97_Weather19", "v1/MergerIntoSlowTrafficV2_Town12_Route1009_Weather21", "v1/TJunction_Town01_Route90_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town04_Route201_Weather19", "v1/ParkingCrossingPedestrian_Town12_Route952_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1358_Weather11", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1631_Weather10", "v1/AccidentTwoWays_Town12_Route1121_Weather3", "v1/HardBreakRoute_Town05_Route42_Weather8", "v1/SignalizedJunctionLeftTurn_Town07_Route336_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town05_Route254_Weather20", "v1/ParkedObstacle_Town11_Route395_Weather5", "v1/ParkedObstacleTwoWays_Town12_Route1175_Weather5", "v1/InvadingTurn_Town12_Route796_Weather8", "v1/HazardAtSideLane_Town05_Route222_Weather0", "v1/MergerIntoSlowTrafficV2_Town12_Route1043_Weather3", "v1/EnterActorFlow_Town12_Route832_Weather0", "v1/AccidentTwoWays_Town12_Route1126_Weather8", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route531_Weather11", "v1/TJunction_Town13_Route691_Weather15", "v1/TJunction_Town01_Route91_Weather13", "v1/ParkedObstacleTwoWays_Town12_Route1161_Weather9", "v1/ParkedObstacle_Town06_Route328_Weather8", "v1/Accident_Town06_Route327_Weather15", "v1/ControlLoss_Town13_Route574_Weather19", "v1/HardBreakRoute_Town15_Route58_Weather6", "v1/HighwayExit_Town13_Route621_Weather23", "v1/NonSignalizedJunctionLeftTurn_Town05_Route239_Weather26", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town04_Route203_Weather21", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route530_Weather10", "v1/MergerIntoSlowTrafficV2_Town12_Route1055_Weather15", "v1/ConstructionObstacle_Town12_Route79_Weather1", "v1/BlockedIntersection_Town03_Route136_Weather6", "v1/ParkingExit_Town13_Route677_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town04_Route198_Weather8", "v1/TJunction_Town15_Route457_Weather15", "v1/OppositeVehicleTakingPriority_Town15_Route477_Weather9", "v1/VehicleTurningRoute_Town15_Route479_Weather11", "v1/ParkingCutIn_Town13_Route1346_Weather5", "v1/SignalizedJunctionLeftTurn_Town05_Route234_Weather7", "v1/AccidentTwoWays_Town12_Route1109_Weather9", 
"v1/ConstructionObstacleTwoWays_Town12_Route1094_Weather2", "v1/CrossingBicycleFlow_Town12_Route1072_Weather6", "v1/VanillaSignalizedTurnEncounterRedLight_Town15_Route492_Weather23", "v1/ConstructionObstacleTwoWays_Town12_Route1100_Weather14", "v1/NonSignalizedJunctionLeftTurn_Town04_Route182_Weather26", "v1/ParkingCrossingPedestrian_Town12_Route898_Weather14", "v1/ParkingExit_Town12_Route1316_Weather0", "v1/StaticCutIn_Town13_Route565_Weather8", "v1/SignalizedJunctionRightTurn_Town12_Route803_Weather23", "v1/HazardAtSideLane_Town12_Route1538_Weather13", "v1/MergerIntoSlowTraffic_Town13_Route626_Weather2", "v1/StaticCutIn_Town12_Route785_Weather5", "v1/HazardAtSideLaneTwoWays_Town12_Route1138_Weather20", "v1/HazardAtSideLane_Town12_Route1536_Weather11", "v1/OppositeVehicleTakingPriority_Town13_Route600_Weather2", "v1/HazardAtSideLane_Town12_Route1530_Weather2", "v1/OppositeVehicleTakingPriority_Town13_Route601_Weather3", "v1/HardBreakRoute_Town13_Route1337_Weather26", "v1/LaneChange_Town12_Route894_Weather10", "v1/HazardAtSideLane_Town06_Route329_Weather9", "v1/ParkingExit_Town13_Route732_Weather3", "v1/VehicleTurningRoute_Town15_Route1367_Weather3", "v1/HighwayExit_Town13_Route748_Weather20", "v1/Accident_Town03_Route101_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route1054_Weather14", "v1/ParkingCutIn_Town12_Route1314_Weather5", "v1/HighwayCutIn_Town06_Route322_Weather10", "v1/StaticCutIn_Town13_Route566_Weather20", "v1/AccidentTwoWays_Town12_Route1463_Weather22", "v1/CrossingBicycleFlow_Town12_Route1011_Weather23", "v1/HazardAtSideLaneTwoWays_Town12_Route1153_Weather9", "v1/ControlLoss_Town07_Route333_Weather21", "v1/TJunction_Town12_Route880_Weather22", "v1/Accident_Town12_Route957_Weather21", "v1/VanillaSignalizedTurnEncounterGreenLight_Town10HD_Route386_Weather22", "v1/AccidentTwoWays_Town12_Route1120_Weather2", "v1/InvadingTurn_Town12_Route925_Weather15", "v1/VehicleTurningRoute_Town12_Route822_Weather18", "v1/VehicleTurningRoutePedestrian_Town13_Route607_Weather19", "v1/HighwayExit_Town06_Route313_Weather1", "v1/Accident_Town04_Route205_Weather23", "v1/HazardAtSideLane_Town12_Route1508_Weather2", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route1035_Weather21", "v1/HazardAtSideLane_Town12_Route1533_Weather6", "v1/NonSignalizedJunctionLeftTurn_Town03_Route122_Weather26", "v1/PedestrianCrossing_Town15_Route526_Weather6", "v1/VanillaSignalizedTurnEncounterRedLight_Town04_Route199_Weather9", "v1/MergerIntoSlowTraffic_Town12_Route1003_Weather8", "v1/VehicleTurningRoute_Town15_Route1374_Weather13", "v1/VehicleTurningRoutePedestrian_Town12_Route970_Weather8", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route888_Weather3", "v1/MergerIntoSlowTrafficV2_Town12_Route1053_Weather13", "v1/EnterActorFlow_Town12_Route830_Weather23", "v1/YieldToEmergencyVehicle_Town13_Route562_Weather15", "v1/AccidentTwoWays_Town12_Route1453_Weather12", "v1/OppositeVehicleRunningRedLight_Town07_Route368_Weather3", "v1/HardBreakRoute_Town06_Route45_Weather19", "v1/HighwayCutIn_Town13_Route630_Weather6", "v1/ParkingExit_Town12_Route923_Weather13", "v1/LaneChange_Town12_Route983_Weather5", "v1/ParkingExit_Town13_Route676_Weather0", "v1/StaticCutIn_Town05_Route275_Weather15", "v1/InvadingTurn_Town04_Route217_Weather9", "v1/AccidentTwoWays_Town12_Route1455_Weather14", "v1/HighwayExit_Town12_Route1327_Weather3", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route660_Weather10", "v1/TJunction_Town07_Route365_Weather1", "v1/SignalizedJunctionRightTurn_Town12_Route988_Weather0", 
"v1/AccidentTwoWays_Town12_Route1124_Weather18", "v1/HighwayCutIn_Town13_Route734_Weather6", "v1/VehicleTurningRoute_Town13_Route698_Weather22", "v1/OppositeVehicleTakingPriority_Town05_Route270_Weather6", "v1/TJunction_Town15_Route496_Weather2", "v1/EnterActorFlow_Town07_Route350_Weather12", "v1/ParkingCutIn_Town13_Route1345_Weather3", "v1/HazardAtSideLane_Town12_Route1532_Weather5", "v1/PedestrianCrossing_Town12_Route867_Weather9", "v1/ConstructionObstacle_Town06_Route73_Weather21", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route721_Weather19", "v1/TJunction_Town15_Route495_Weather1", "v1/HighwayExit_Town12_Route1041_Weather1", "v1/ParkedObstacleTwoWays_Town12_Route1178_Weather8", "v1/ParkedObstacleTwoWays_Town12_Route1159_Weather23", "v1/NonSignalizedJunctionLeftTurn_Town03_Route124_Weather26", "v1/HardBreakRoute_Town13_Route1338_Weather26", "v1/PedestrianCrossing_Town12_Route1045_Weather5", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route875_Weather9", "v1/HardBreakRoute_Town13_Route1339_Weather26", "v1/CrossingBicycleFlow_Town12_Route1071_Weather0", "v1/ParkingCrossingPedestrian_Town12_Route897_Weather13", "v1/PedestrianCrossing_Town12_Route943_Weather7", "v1/VehicleTurningRoutePedestrian_Town15_Route482_Weather20", "v1/AccidentTwoWays_Town12_Route1461_Weather20", "v1/HazardAtSideLaneTwoWays_Town12_Route1130_Weather12", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route535_Weather15", "v1/ConstructionObstacle_Town05_Route68_Weather8", "v1/StaticCutIn_Town04_Route216_Weather8", "v1/HazardAtSideLaneTwoWays_Town12_Route1154_Weather10", "v1/ParkingExit_Town12_Route1321_Weather6", "v1/HardBreakRoute_Town13_Route54_Weather2", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route512_Weather18", "v1/SignalizedJunctionLeftTurn_Town12_Route1470_Weather5", "v1/SignalizedJunctionRightTurn_Town15_Route473_Weather5", "v1/ConstructionObstacleTwoWays_Town12_Route1414_Weather26", "v1/ParkingExit_Town13_Route678_Weather2", "v1/VehicleTurningRoute_Town15_Route1368_Weather5", "v1/NonSignalizedJunctionRightTurn_Town04_Route185_Weather9", "v1/HighwayCutIn_Town12_Route1005_Weather9", "v1/HardBreakRoute_Town13_Route1340_Weather26", "v1/ParkingCrossingPedestrian_Town12_Route899_Weather15", "v1/InvadingTurn_Town15_Route434_Weather18", "v1/VehicleTurningRoute_Town15_Route443_Weather1", "v1/DynamicObjectCrossing_Town15_Route27_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town10HD_Route392_Weather2", "v1/HardBreakRoute_Town15_Route57_Weather5", "v1/ParkingExit_Town12_Route789_Weather9", "v1/BlockedIntersection_Town13_Route617_Weather19", "v1/TJunction_Town05_Route259_Weather0", "v1/AccidentTwoWays_Town12_Route1112_Weather20", "v1/SignalizedJunctionRightTurn_Town04_Route211_Weather3", "v1/ParkedObstacleTwoWays_Town12_Route1171_Weather1", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route722_Weather20", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route643_Weather19", "v1/HazardAtSideLane_Town12_Route1516_Weather13", "v1/YieldToEmergencyVehicle_Town12_Route918_Weather8", "v1/HazardAtSideLaneTwoWays_Town12_Route1131_Weather13", "v1/VanillaSignalizedTurnEncounterRedLight_Town07_Route358_Weather20", "v1/InvadingTurn_Town05_Route230_Weather22", "v1/ParkingCutIn_Town12_Route765_Weather11", "v1/Accident_Town05_Route219_Weather11", "v1/ParkingCutIn_Town12_Route954_Weather18", "v1/DynamicObjectCrossing_Town01_Route7_Weather7", "v1/OppositeVehicleTakingPriority_Town12_Route1038_Weather23", "v1/PedestrianCrossing_Town12_Route1013_Weather25", 
"v1/CrossingBicycleFlow_Town12_Route1061_Weather21", "v1/OppositeVehicleRunningRedLight_Town03_Route121_Weather13", "v1/VanillaSignalizedTurnEncounterGreenLight_Town03_Route139_Weather9", "v1/HardBreakRoute_Town04_Route41_Weather15", "v1/HardBreakRoute_Town04_Route40_Weather14", "v1/ParkingCrossingPedestrian_Town15_Route462_Weather20", "v1/ConstructionObstacleTwoWays_Town12_Route1089_Weather23", "v1/TJunction_Town06_Route304_Weather18", "v1/MergerIntoSlowTrafficV2_Town12_Route1060_Weather20", "v1/StaticCutIn_Town15_Route426_Weather10", "v1/HazardAtSideLaneTwoWays_Town12_Route1144_Weather0", "v1/TJunction_Town12_Route881_Weather23", "v1/ParkingCutIn_Town12_Route1303_Weather8", "v1/SignalizedJunctionLeftTurn_Town03_Route114_Weather6", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route690_Weather14", "v1/MergerIntoSlowTraffic_Town12_Route844_Weather12", "v1/OppositeVehicleRunningRedLight_Town13_Route588_Weather8", "v1/HazardAtSideLane_Town12_Route774_Weather20", "v1/BlockedIntersection_Town15_Route485_Weather9", "v1/SignalizedJunctionLeftTurn_Town03_Route150_Weather26", "v1/ConstructionObstacle_Town12_Route76_Weather23", "v1/StaticCutIn_Town06_Route288_Weather2", "v1/MergerIntoSlowTraffic_Town12_Route1004_Weather8", "v1/StaticCutIn_Town05_Route265_Weather5", "v1/HardBreakRoute_Town03_Route37_Weather11", "v1/DynamicObjectCrossing_Town01_Route6_Weather6", "v1/CrossingBicycleFlow_Town12_Route1032_Weather18", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route455_Weather13", "v1/ConstructionObstacleTwoWays_Town12_Route1410_Weather26", "v1/ParkedObstacle_Town13_Route554_Weather12", "v1/SignalizedJunctionLeftTurn_Town07_Route366_Weather2", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route689_Weather13", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route724_Weather22", "v1/ParkingCutIn_Town12_Route1311_Weather1", "v1/OppositeVehicleRunningRedLight_Town05_Route235_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town05_Route252_Weather21", "v1/LaneChange_Town12_Route757_Weather10", "v1/VehicleTurningRoute_Town15_Route1377_Weather8", "v1/ParkingCrossingPedestrian_Town15_Route514_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town03_Route123_Weather26", "v1/CrossingBicycleFlow_Town12_Route1076_Weather10", "v1/ParkingCrossingPedestrian_Town15_Route513_Weather19", "v1/ParkedObstacle_Town12_Route772_Weather11", "v1/OppositeVehicleTakingPriority_Town04_Route187_Weather5", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route651_Weather1", "v1/SignalizedJunctionRightTurn_Town07_Route338_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town07_Route362_Weather23", "v1/Accident_Town04_Route159_Weather3", "v1/NonSignalizedJunctionLeftTurn_Town12_Route810_Weather10", "v1/MergerIntoSlowTraffic_Town12_Route845_Weather13", "v1/Accident_Town06_Route279_Weather19", "v1/ParkingCutIn_Town13_Route1344_Weather2", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route644_Weather20", "v1/OppositeVehicleTakingPriority_Town05_Route242_Weather15", "v1/DynamicObjectCrossing_Town01_Route5_Weather2", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1351_Weather1", "v1/HazardAtSideLane_Town12_Route1521_Weather18", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town03_Route144_Weather14", "v1/SignalizedJunctionLeftTurn_Town05_Route267_Weather3", "v1/VehicleOpensDoorTwoWays_Town12_Route1196_Weather0", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route494_Weather0", "v1/ConstructionObstacle_Town13_Route80_Weather2", "v1/ConstructionObstacle_Town03_Route62_Weather10", 
"v1/VanillaNonSignalizedTurnEncounterStopsign_Town07_Route361_Weather23", "v1/OppositeVehicleRunningRedLight_Town13_Route590_Weather18", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route533_Weather13", "v1/PedestrianCrossing_Town12_Route1033_Weather20", "v1/Accident_Town13_Route550_Weather3", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route948_Weather9", "v1/LaneChange_Town12_Route892_Weather8", "v1/HardBreakRoute_Town13_Route56_Weather3", "v1/DynamicObjectCrossing_Town13_Route24_Weather23", "v1/DynamicObjectCrossing_Town01_Route1_Weather1", "v1/MergerIntoSlowTrafficV2_Town12_Route859_Weather1", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route645_Weather21", "v1/TJunction_Town12_Route947_Weather11", "v1/NonSignalizedJunctionLeftTurn_Town04_Route181_Weather15", "v1/ConstructionObstacle_Town05_Route70_Weather18", "v1/HazardAtSideLaneTwoWays_Town12_Route1134_Weather8", "v1/SignalizedJunctionLeftTurn_Town04_Route172_Weather8", "v1/ConstructionObstacleTwoWays_Town12_Route1086_Weather20", "v1/ParkedObstacleTwoWays_Town13_Route1336_Weather26", "v1/HardBreakRoute_Town01_Route30_Weather3", "v1/InterurbanAdvancedActorFlow_Town06_Route301_Weather15", "v1/HazardAtSideLane_Town12_Route1524_Weather21", "v1/HighwayExit_Town13_Route683_Weather7", "v1/ParkedObstacle_Town12_Route958_Weather22", "v1/HazardAtSideLaneTwoWays_Town12_Route1136_Weather18", "v1/BlockedIntersection_Town05_Route272_Weather12", "v1/VehicleTurningRoutePedestrian_Town15_Route1387_Weather1", "v1/OppositeVehicleTakingPriority_Town05_Route243_Weather9", "v1/HazardAtSideLane_Town12_Route1523_Weather20", "v1/NonSignalizedJunctionRightTurn_Town13_Route598_Weather0", "v1/HighwayCutIn_Town13_Route750_Weather22", "v1/HardBreakRoute_Town11_Route50_Weather23", "v1/ControlLoss_Town07_Route332_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1352_Weather2", "v1/VanillaSignalizedTurnEncounterRedLight_Town04_Route200_Weather18", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route872_Weather14", "v1/HighwayCutIn_Town06_Route299_Weather13", "v1/Accident_Town12_Route768_Weather14", "v1/InvadingTurn_Town05_Route231_Weather23", "v1/DynamicObjectCrossing_Town10HD_Route18_Weather18", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route537_Weather9", "v1/ConstructionObstacleTwoWays_Town12_Route1415_Weather26", "v1/SignalizedJunctionRightTurn_Town13_Route583_Weather11", "v1/BlockedIntersection_Town07_Route353_Weather15", "v1/ParkedObstacle_Town15_Route417_Weather1", "v1/ParkingExit_Town12_Route1318_Weather2", "v1/InvadingTurn_Town05_Route266_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1630_Weather7", "v1/ConstructionObstacleTwoWays_Town12_Route1091_Weather12", "v1/HazardAtSideLaneTwoWays_Town12_Route1132_Weather14", "v1/BlockedIntersection_Town12_Route834_Weather2", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town05_Route257_Weather23", "v1/VehicleTurningRoute_Town15_Route1379_Weather18", "v1/HazardAtSideLane_Town12_Route1520_Weather9", "v1/DynamicObjectCrossing_Town12_Route23_Weather23", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route460_Weather18", "v1/PedestrianCrossing_Town15_Route506_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route969_Weather7", "v1/StaticCutIn_Town03_Route110_Weather6", "v1/ParkingExit_Town12_Route787_Weather7", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route945_Weather9", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route499_Weather5", "v1/ParkedObstacle_Town06_Route281_Weather21", 
"v1/VanillaSignalizedTurnEncounterRedLight_Town05_Route255_Weather21", "v1/TJunction_Town12_Route927_Weather9", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route641_Weather9", "v1/TJunction_Town06_Route305_Weather19", "v1/AccidentTwoWays_Town12_Route1117_Weather25", "v1/InterurbanAdvancedActorFlow_Town06_Route302_Weather21", "v1/ControlLoss_Town11_Route402_Weather12", "v1/EnterActorFlow_Town13_Route681_Weather5", "v1/HazardAtSideLaneTwoWays_Town12_Route1143_Weather25", "v1/ConstructionObstacleTwoWays_Town12_Route1406_Weather26", "v1/ParkedObstacle_Town12_Route959_Weather23", "v1/VehicleTurningRoute_Town15_Route1369_Weather6", "v1/NonSignalizedJunctionRightTurn_Town07_Route345_Weather14", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route868_Weather10", "v1/NonSignalizedJunctionLeftTurn_Town07_Route369_Weather18", "v1/TJunction_Town15_Route510_Weather8", "v1/VehicleTurningRoute_Town13_Route606_Weather18", "v1/DynamicObjectCrossing_Town02_Route14_Weather14", "v1/DynamicObjectCrossing_Town10HD_Route19_Weather19", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route879_Weather21", "v1/SignalizedJunctionRightTurn_Town04_Route177_Weather13", "v1/OppositeVehicleTakingPriority_Town12_Route818_Weather12", "v1/EnterActorFlow_Town12_Route831_Weather25", "v1/AccidentTwoWays_Town12_Route1105_Weather13", "v1/HazardAtSideLane_Town03_Route106_Weather23", "v1/HighwayExit_Town12_Route1326_Weather2", "v1/CrossingBicycleFlow_Town12_Route1067_Weather1", "v1/SignalizedJunctionRightTurn_Town12_Route964_Weather2", "v1/SignalizedJunctionRightTurn_Town12_Route805_Weather25", "v1/PedestrianCrossing_Town13_Route736_Weather8", "v1/AccidentTwoWays_Town12_Route1113_Weather21", "v1/ConstructionObstacleTwoWays_Town12_Route1418_Weather26", "v1/HighwayExit_Town13_Route620_Weather22", "v1/Accident_Town15_Route411_Weather21", "v1/HazardAtSideLane_Town12_Route961_Weather25", "v1/LaneChange_Town13_Route664_Weather14", "v1/HazardAtSideLaneTwoWays_Town12_Route1137_Weather19", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route650_Weather0", "v1/HighwayCutIn_Town12_Route1052_Weather12", "v1/HighwayCutIn_Town13_Route751_Weather23", "v1/HighwayCutIn_Town12_Route851_Weather19", "v1/HazardAtSideLaneTwoWays_Town12_Route1147_Weather3", "v1/BlockedIntersection_Town05_Route247_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route944_Weather8", "v1/SignalizedJunctionRightTurn_Town03_Route151_Weather2", "v1/HardBreakRoute_Town12_Route52_Weather0", "v1/HardBreakRoute_Town03_Route36_Weather10", "v1/NonSignalizedJunctionLeftTurn_Town13_Route593_Weather26", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1350_Weather0", "v1/HazardAtSideLane_Town12_Route1518_Weather15", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route659_Weather3", "v1/OppositeVehicleRunningRedLight_Town05_Route236_Weather10", "v1/EnterActorFlow_Town07_Route348_Weather10", "v1/MergerIntoSlowTrafficV2_Town12_Route1031_Weather9", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route454_Weather12", "v1/ConstructionObstacle_Town04_Route67_Weather15", "v1/ParkedObstacle_Town03_Route157_Weather1", "v1/CrossingBicycleFlow_Town12_Route1074_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1353_Weather3", "v1/HazardAtSideLane_Town04_Route164_Weather8", "v1/HighwayExit_Town13_Route706_Weather3", "v1/ParkedObstacle_Town12_Route770_Weather8", "v1/VehicleTurningRoute_Town12_Route933_Weather23", "v1/YieldToEmergencyVehicle_Town15_Route423_Weather7", "v1/Accident_Town13_Route551_Weather5", 
"v1/MergerIntoSlowTrafficV2_Town12_Route1057_Weather9", "v1/InterurbanAdvancedActorFlow_Town13_Route735_Weather7", "v1/OppositeVehicleRunningRedLight_Town12_Route929_Weather19", "v1/MergerIntoSlowTraffic_Town12_Route973_Weather11", "v1/ParkingExit_Town13_Route570_Weather23", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route738_Weather10", "v1/SignalizedJunctionLeftTurn_Town10HD_Route381_Weather22", "v1/YieldToEmergencyVehicle_Town13_Route561_Weather14", "v1/ConstructionObstacleTwoWays_Town12_Route1099_Weather7", "v1/DynamicObjectCrossing_Town02_Route13_Weather6", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route982_Weather20", "v1/LaneChange_Town13_Route739_Weather25", "v1/ConstructionObstacle_Town12_Route75_Weather23", "v1/AccidentTwoWays_Town12_Route1102_Weather10", "v1/ParkingCrossingPedestrian_Town15_Route403_Weather13", "v1/ControlLoss_Town04_Route169_Weather13", "v1/HazardAtSideLaneTwoWays_Town12_Route1142_Weather23", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route648_Weather23", "v1/HardBreakRoute_Town02_Route35_Weather9", "v1/HazardAtSideLaneTwoWays_Town12_Route1157_Weather13", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route878_Weather20", "v1/CrossingBicycleFlow_Town12_Route1068_Weather2", "v1/ConstructionObstacleTwoWays_Town12_Route1080_Weather14", "v1/AccidentTwoWays_Town12_Route1127_Weather9", "v1/HighwayExit_Town12_Route1046_Weather6", "v1/ConstructionObstacleTwoWays_Town12_Route1413_Weather26", "v1/ParkedObstacleTwoWays_Town12_Route1168_Weather23", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route891_Weather7", "v1/HighwayCutIn_Town12_Route849_Weather9", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1361_Weather14", "v1/HighwayExit_Town06_Route291_Weather5", "v1/DynamicObjectCrossing_Town13_Route25_Weather25", "v1/InterurbanAdvancedActorFlow_Town06_Route331_Weather19", "v1/TJunction_Town15_Route456_Weather14", "v1/VehicleTurningRoutePedestrian_Town12_Route1027_Weather13", "v1/VehicleTurningRoutePedestrian_Town12_Route827_Weather22", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route870_Weather12", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route532_Weather12", "v1/VanillaSignalizedTurnEncounterGreenLight_Town05_Route250_Weather8", "v1/VehicleTurningRoute_Town15_Route1378_Weather9", "v1/SignalizedJunctionLeftTurn_Town03_Route113_Weather26", "v1/AccidentTwoWays_Town12_Route1111_Weather19", "v1/MergerIntoSlowTrafficV2_Town12_Route1058_Weather18", "v1/ConstructionObstacle_Town04_Route66_Weather14", "v1/TJunction_Town12_Route882_Weather23", "v1/ParkedObstacleTwoWays_Town12_Route1158_Weather14", "v1/OppositeVehicleRunningRedLight_Town03_Route152_Weather22", "v1/HighwayCutIn_Town06_Route298_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1359_Weather12", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route981_Weather19", "v1/Accident_Town03_Route156_Weather0", "v1/NonSignalizedJunctionRightTurn_Town04_Route184_Weather2", "v1/ParkedObstacle_Town06_Route309_Weather23", "v1/MergerIntoSlowTrafficV2_Town12_Route1056_Weather8", "v1/PedestrianCrossing_Town12_Route1014_Weather0", "v1/ConstructionObstacleTwoWays_Town12_Route1097_Weather5", "v1/CrossingBicycleFlow_Town12_Route1073_Weather7", "v1/CrossingBicycleFlow_Town12_Route1064_Weather23", "v1/InterurbanAdvancedActorFlow_Town06_Route303_Weather22", "v1/OppositeVehicleRunningRedLight_Town04_Route179_Weather14", "v1/ParkingExit_Town12_Route1319_Weather3", "v1/ParkingCutIn_Town12_Route901_Weather9", "v1/VehicleTurningRoute_Town13_Route699_Weather23", 
"v1/AccidentTwoWays_Town12_Route1448_Weather5", "v1/HardBreakRoute_Town13_Route55_Weather3", "v1/HazardAtSideLaneTwoWays_Town12_Route1156_Weather12", "v1/SignalizedJunctionLeftTurn_Town05_Route233_Weather6", "v1/ConstructionObstacleTwoWays_Town12_Route1422_Weather26", "v1/CrossingBicycleFlow_Town12_Route1044_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route509_Weather15", "v1/ParkingExit_Town12_Route1320_Weather5", "v1/VehicleTurningRoute_Town12_Route997_Weather9", "v1/ConstructionObstacle_Town15_Route85_Weather7", "v1/DynamicObjectCrossing_Town13_Route26_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1357_Weather10", "v1/VehicleTurningRoutePedestrian_Town12_Route829_Weather25", "v1/HardBreakRoute_Town01_Route31_Weather5", "v1/AccidentTwoWays_Town12_Route1116_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route647_Weather23", "v1/BlockedIntersection_Town04_Route194_Weather12", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route871_Weather13", "v1/HazardAtSideLane_Town12_Route1528_Weather0", "v1/HazardAtSideLane_Town12_Route777_Weather23", "v1/TJunction_Town07_Route363_Weather25", "v1/DynamicObjectCrossing_Town11_Route20_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1355_Weather6", "v1/VanillaSignalizedTurnEncounterGreenLight_Town05_Route251_Weather9", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route692_Weather8", "v1/ParkedObstacleTwoWays_Town12_Route1169_Weather25", "v1/EnterActorFlow_Town04_Route192_Weather10", "v1/ParkingCutIn_Town13_Route1342_Weather0", "v1/DynamicObjectCrossing_Town01_Route4_Weather3", "v1/SignalizedJunctionLeftTurn_Town07_Route334_Weather26", "v1/BlockedIntersection_Town13_Route615_Weather9", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town07_Route360_Weather22", "v1/HazardAtSideLane_Town12_Route1525_Weather22", "v1/ControlLoss_Town15_Route431_Weather15", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town05_Route258_Weather25", "v1/BlockedIntersection_Town03_Route134_Weather3", "v1/NonSignalizedJunctionLeftTurn_Town04_Route212_Weather26", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route884_Weather0", "v1/ParkingCutIn_Town12_Route1315_Weather6", "v1/ParkingCutIn_Town13_Route672_Weather22", "v1/VanillaSignalizedTurnEncounterRedLight_Town07_Route357_Weather19", "v1/ParkedObstacle_Town05_Route273_Weather7", "v1/YieldToEmergencyVehicle_Town15_Route425_Weather9", "v1/MergerIntoSlowTrafficV2_Town12_Route856_Weather23", "v1/ParkedObstacleTwoWays_Town12_Route1173_Weather3", "v1/SignalizedJunctionLeftTurn_Town04_Route174_Weather18", "v1/ConstructionObstacle_Town15_Route84_Weather6", "v1/InterurbanActorFlow_Town12_Route1296_Weather7", "v1/HighwayExit_Town06_Route311_Weather25", "v1/LaneChange_Town13_Route740_Weather0", "v1/HazardAtSideLane_Town12_Route1534_Weather7", "v1/ParkingCutIn_Town12_Route763_Weather9", "v1/ParkingCutIn_Town12_Route1310_Weather0", "v1/ParkingCutIn_Town13_Route729_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route642_Weather18", "v1/PedestrianCrossing_Town13_Route716_Weather14", "v1/PedestrianCrossing_Town12_Route864_Weather6", "v1/ConstructionObstacle_Town03_Route60_Weather8", "v1/HazardAtSideLane_Town05_Route223_Weather15", "v1/Accident_Town15_Route412_Weather22", "v1/ConstructionObstacle_Town04_Route65_Weather13", "v1/ParkedObstacleTwoWays_Town13_Route1335_Weather26", "v1/PedestrianCrossing_Town12_Route866_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route930_Weather20", "v1/VanillaSignalizedTurnEncounterGreenLight_Town03_Route137_Weather7", 
"v1/HighwayCutIn_Town12_Route1047_Weather7", "v1/ParkingCutIn_Town13_Route696_Weather20", "v1/ParkedObstacleTwoWays_Town12_Route1165_Weather21", "v1/HighwayCutIn_Town13_Route629_Weather5", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route1015_Weather1", "v1/HardBreakRoute_Town12_Route51_Weather25", "v1/YieldToEmergencyVehicle_Town12_Route919_Weather11", "v1/InvadingTurn_Town12_Route924_Weather14", "v1/ConstructionObstacle_Town13_Route83_Weather5", "v1/HighwayExit_Town13_Route744_Weather8", "v1/Accident_Town12_Route767_Weather13", "v1/ParkedObstacleTwoWays_Town12_Route1174_Weather3", "v1/Accident_Town03_Route146_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1363_Weather8", "v1/DynamicObjectCrossing_Town02_Route15_Weather15", "v1/NonSignalizedJunctionLeftTurn_Town04_Route183_Weather1", "v1/SignalizedJunctionLeftTurn_Town05_Route232_Weather23", "v1/ParkingCrossingPedestrian_Town15_Route405_Weather15", "v1/VehicleTurningRoutePedestrian_Town13_Route609_Weather21", "v1/HighwayExit_Town13_Route704_Weather2", "v1/BlockedIntersection_Town10HD_Route391_Weather1", "v1/HighwayExit_Town12_Route1331_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route876_Weather18", "v1/YieldToEmergencyVehicle_Town13_Route673_Weather23", "v1/OppositeVehicleRunningRedLight_Town12_Route990_Weather2", "v1/TJunction_Town05_Route261_Weather1", "v1/StaticCutIn_Town12_Route782_Weather2", "v1/HighwayExit_Town12_Route838_Weather6", "v1/VanillaSignalizedTurnEncounterRedLight_Town05_Route253_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town10HD_Route387_Weather23", "v1/ConstructionObstacle_Town15_Route87_Weather9", "v1/TJunction_Town13_Route652_Weather2", "v1/ParkingCutIn_Town12_Route1302_Weather15", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route498_Weather23", "v1/YieldToEmergencyVehicle_Town15_Route424_Weather8", "v1/CrossingBicycleFlow_Town12_Route1070_Weather3", "v1/ConstructionObstacle_Town04_Route64_Weather12", "v1/ParkingCutIn_Town12_Route902_Weather18", "v1/MergerIntoSlowTrafficV2_Town12_Route1059_Weather19", "v1/MergerIntoSlowTrafficV2_Town12_Route976_Weather14", "v1/SignalizedJunctionLeftTurn_Town12_Route799_Weather0", "v1/InterurbanAdvancedActorFlow_Town12_Route1030_Weather8", "v1/YieldToEmergencyVehicle_Town03_Route148_Weather18", "v1/ConstructionObstacle_Town06_Route72_Weather20", "v1/PedestrianCrossing_Town13_Route636_Weather12", "v1/SignalizedJunctionRightTurn_Town15_Route438_Weather7", "v1/Accident_Town04_Route160_Weather3", "v1/Accident_Town12_Route1122_Weather3", "v1/Accident_Town06_Route308_Weather22", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1362_Weather15", "v1/CrossingBicycleFlow_Town12_Route1012_Weather23", "v1/OppositeVehicleTakingPriority_Town05_Route241_Weather7", "v1/InterurbanAdvancedActorFlow_Town13_Route634_Weather10", "v1/DynamicObjectCrossing_Town02_Route9_Weather9", "v1/Accident_Town15_Route413_Weather23", "v1/ParkedObstacle_Town13_Route556_Weather10", "v1/OppositeVehicleTakingPriority_Town12_Route968_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route720_Weather19", "v1/NonSignalizedJunctionRightTurn_Town07_Route346_Weather15", "v1/BlockedIntersection_Town13_Route618_Weather20", "v1/HighwayCutIn_Town13_Route713_Weather11", "v1/AccidentTwoWays_Town12_Route1103_Weather11", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route1016_Weather2", "v1/ParkedObstacle_Town15_Route418_Weather2", "v1/ParkedObstacle_Town05_Route220_Weather12", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town05_Route256_Weather22", 
"v1/VehicleTurningRoutePedestrian_Town15_Route523_Weather2", "v1/ParkingCrossingPedestrian_Town12_Route761_Weather7", "v1/EnterActorFlow_Town13_Route611_Weather13", "v1/MergerIntoSlowTrafficV2_Town15_Route525_Weather5", "v1/HazardAtSideLane_Town12_Route1517_Weather14", "v1/ParkingExit_Town12_Route1322_Weather7", "v1/ConstructionObstacle_Town12_Route77_Weather25", "v1/ConstructionObstacle_Town05_Route71_Weather19", "v1/StaticCutIn_Town13_Route564_Weather18", "v1/ConstructionObstacleTwoWays_Town12_Route1096_Weather3", "v1/BlockedIntersection_Town12_Route835_Weather3", "v1/HighwayExit_Town06_Route293_Weather15", "v1/NonSignalizedJunctionRightTurn_Town12_Route815_Weather9", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route489_Weather21", "v1/BlockedIntersection_Town12_Route836_Weather3", "v1/ParkingCutIn_Town12_Route900_Weather19", "v1/CrossingBicycleFlow_Town12_Route862_Weather3", "v1/OppositeVehicleTakingPriority_Town03_Route155_Weather25", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route869_Weather11", "v1/ParkingExit_Town12_Route1307_Weather20", "v1/ControlLoss_Town11_Route401_Weather11", "v1/AccidentTwoWays_Town12_Route1107_Weather15", "v1/InterurbanActorFlow_Town13_Route708_Weather6", "v1/VehicleTurningRoute_Town12_Route824_Weather20", "v1/ControlLoss_Town10HD_Route377_Weather13"], "val": ["v1/ParkingCrossingPedestrian_Town13_Route545_Weather25", "v1/OppositeVehicleTakingPriority_Town04_Route214_Weather6", "v1/DynamicObjectCrossing_Town02_Route11_Weather11", "v1/AccidentTwoWays_Town12_Route1115_Weather23", "v1/VehicleTurningRoute_Town15_Route504_Weather10", "v1/ParkingExit_Town12_Route922_Weather12", "v1/SignalizedJunctionLeftTurn_Town04_Route173_Weather26", "v1/EnterActorFlow_Town03_Route132_Weather2", "v1/HighwayExit_Town06_Route312_Weather0", "v1/VanillaSignalizedTurnEncounterRedLight_Town15_Route491_Weather23", "v1/CrossingBicycleFlow_Town12_Route977_Weather15", "v1/OppositeVehicleRunningRedLight_Town04_Route180_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town07_Route359_Weather21", "v1/ParkingCutIn_Town13_Route1343_Weather1", "v1/ParkedObstacle_Town06_Route282_Weather22", "v1/TJunction_Town06_Route306_Weather20", "v1/PedestrianCrossing_Town13_Route747_Weather19", "v1/VehicleTurningRoutePedestrian_Town15_Route445_Weather11", "v1/ConstructionObstacle_Town12_Route78_Weather0", "v1/HazardAtSideLaneTwoWays_Town12_Route1151_Weather7", "v1/ControlLoss_Town04_Route170_Weather14", "v1/MergerIntoSlowTrafficV2_Town12_Route857_Weather25", "v1/DynamicObjectCrossing_Town01_Route3_Weather3", "v1/SignalizedJunctionRightTurn_Town03_Route118_Weather14", "v1/BlockedIntersection_Town03_Route135_Weather5", "v1/MergerIntoSlowTraffic_Town06_Route317_Weather5", "v1/NonSignalizedJunctionRightTurn_Town03_Route126_Weather18", "v1/ParkedObstacleTwoWays_Town13_Route1333_Weather26", "v1/ConstructionObstacleTwoWays_Town12_Route1093_Weather1", "v1/TJunction_Town05_Route260_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town07_Route342_Weather3", "v1/HighwayCutIn_Town12_Route1029_Weather15", "v1/HazardAtSideLane_Town10HD_Route373_Weather9", "v1/YieldToEmergencyVehicle_Town04_Route166_Weather10", "v1/HardBreakRoute_Town01_Route32_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route657_Weather2", "v1/ConstructionObstacle_Town10HD_Route74_Weather22", "v1/ControlLoss_Town10HD_Route378_Weather14", "v1/Accident_Town05_Route218_Weather10", "v1/InterurbanActorFlow_Town12_Route1291_Weather1", "v1/LaneChange_Town06_Route307_Weather21", "v1/InvadingTurn_Town02_Route95_Weather9", 
"v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route979_Weather9", "v1/StaticCutIn_Town05_Route226_Weather18", "v1/VehicleOpensDoorTwoWays_Town12_Route1203_Weather7", "v1/VehicleTurningRoutePedestrian_Town15_Route481_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town07_Route354_Weather8", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route949_Weather13", "v1/InterurbanAdvancedActorFlow_Town06_Route324_Weather2", "v1/ParkedObstacle_Town10HD_Route372_Weather8"]}
\ No newline at end of file
diff --git a/docs/CONVERT_GUIDE.md b/docs/CONVERT_GUIDE.md
new file mode 100644
index 0000000..5b4c988
--- /dev/null
+++ b/docs/CONVERT_GUIDE.md
@@ -0,0 +1,29 @@
+# Code Convert Guide
+
+This document outlines the important considerations when migrating code written for nuScenes or other datasets to Bench2Drive.
+
+## Models
+
+We integrated several MMCV dependencies into the `mmcv` directory, so the original libraries are no longer installed. Refer to the existing methods to see how these modules are used, place your own models and utilities in the `mmcv` directory, and register them (a registration sketch is shown below). Please make sure the `mmcv` directory contains all the modules you need; if not, you will need to add them.
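+
+The snippet below is a minimal sketch of the registration pattern, using the anchor-generator registry that ships in this repo (`mmcv/core/anchor`) as the example; your own model would use the registry exposed by the corresponding builder module.
+
+```python
+# Minimal registration sketch. The registry and decorator below come from
+# mmcv/core/anchor in this repo; pick the registry that matches your module type.
+from mmcv.core.anchor import ANCHOR_GENERATORS
+
+
+@ANCHOR_GENERATORS.register_module()
+class MyAnchorGenerator:
+    """Once registered, the class can be referenced by name in configs."""
+
+    def __init__(self, ranges):
+        self.ranges = ranges
+```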
+
+## Scripts and configs
+
+You can place the configs and scripts for each method under `adzoo/`. Utilities for each method can also be placed there for easier management.
+
+## Details of configs
+
+To create a config for the bench2drive dataset, note the following:
+
+- We have included the Bench2Drive name-to-class mapping and evaluation settings directly in the config. You can use our settings or modify them as needed.
+- Unlike the 10 classes in nuScenes, we use 9 classes in Bench2Drive.
+- Methods like UniAD and VAD use 3 navigation commands on nuScenes, while Bench2Drive uses 6 commands obtained from Carla (see the sketch after this list).
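+
+The exact command encoding lives in this repo's dataset and config code; the sketch below is only illustrative and assumes CARLA's six `RoadOption`-style commands (LEFT, RIGHT, STRAIGHT, LANEFOLLOW, CHANGELANELEFT, CHANGELANERIGHT) versus the three nuScenes-style commands (turn left, turn right, go straight).
+
+```python
+# Illustrative only: one possible way to collapse CARLA's 6 navigation commands
+# into 3 nuScenes-style commands if a model expects the nuScenes convention.
+# The mapping actually used by this repo may differ.
+CARLA_TO_NUSC = {
+    "LEFT": 0,             # turn left
+    "CHANGELANELEFT": 0,
+    "RIGHT": 1,            # turn right
+    "CHANGELANERIGHT": 1,
+    "STRAIGHT": 2,         # go straight / keep lane
+    "LANEFOLLOW": 2,
+}
+
+
+def to_nuscenes_command(carla_command: str) -> int:
+    return CARLA_TO_NUSC[carla_command]
+```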
+
+## Dataset
+
+- The reference frame of the Bench2Drive data differs significantly from the coordinate system used by nuScenes (see [here](https://github.com/Thinklab-SJTU/Bench2Drive/blob/main/docs/anno.md) for details). In `mmcv/datasets/prepare_B2D.py`, we convert the world, ego, and sensor coordinate systems to match the nuScenes reference frame, including the vehicle coordinates, bounding box coordinates, and sensor extrinsics. You can refer to our code for data alignment.
+- In nuScenes, keyframes are annotated at 2 Hz, while Bench2Drive runs at 10 Hz with annotations for every frame. To reproduce UniAD and VAD, we set the window length (the time interval between adjacent points in past and future trajectories) to 0.5 s and the window shift to 0.1 s, so any frame can be selected as the current frame (see the sketch after this list). This fully utilizes Bench2Drive's data and keeps the trajectories aligned with nuScenes.
+- For the map, Bench2Drive stores vectorized maps. You can refer to our code for how to use them, for example extracting map elements within a certain range.
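+
+To make the sampling concrete, here is a small sketch (not taken from the repo) of how the 0.5 s spacing and 0.1 s shift translate into frame indices at the 10 Hz annotation rate; the number of future waypoints below is a hypothetical value.
+
+```python
+# Hypothetical sketch: selecting trajectory frames at 10 Hz with 0.5 s spacing.
+FPS = 10                 # Bench2Drive annotates every frame at 10 Hz
+STEP = int(0.5 * FPS)    # 5 frames between adjacent trajectory points
+SHIFT = int(0.1 * FPS)   # 1 frame: every frame can serve as the current frame
+
+
+def future_indices(current_idx: int, n_future: int = 6) -> list:
+    """Frame indices of the future waypoints for a given current frame."""
+    return [current_idx + (k + 1) * STEP for k in range(n_future)]
+
+
+print(future_indices(100))  # [105, 110, 115, 120, 125, 130]
+```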
+
+## Team agent
+
+To perform closed-loop evaluation in Carla, the team agent sets up sensors to gather data from Carla, uses that data to build all the necessary model inputs, and then converts the model outputs into a `carla.VehicleControl` object (a minimal sketch of this last step is shown below).
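+
+The conversion might look like the following sketch. It assumes the planner already produces scalar throttle/steer/brake targets; it is not the agent code shipped in `team_code/`.
+
+```python
+import carla
+
+
+def to_vehicle_control(throttle: float, steer: float, brake: float) -> carla.VehicleControl:
+    """Clamp planner outputs into the ranges carla.VehicleControl expects."""
+    control = carla.VehicleControl()
+    control.throttle = float(min(max(throttle, 0.0), 1.0))  # [0, 1]
+    control.steer = float(min(max(steer, -1.0), 1.0))       # [-1, 1]
+    control.brake = float(min(max(brake, 0.0), 1.0))        # [0, 1]
+    return control
+```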
\ No newline at end of file
diff --git a/docs/DATA_PREP.md b/docs/DATA_PREP.md
new file mode 100644
index 0000000..066338d
--- /dev/null
+++ b/docs/DATA_PREP.md
@@ -0,0 +1,81 @@
+# Prepare Bench2Drive Dataset
+
+## Download Bench2Drive
+
+Download our dataset from (LINK) and make sure the data is structured as follows:
+
+```
+ Bench2DriveZoo
+ ├── ...
+ ├── data/
+ | ├── bench2drive/
+ | | ├── v1/ # Bench2Drive base
+ | | | ├── Accident_Town03_Route101_Weather23/
+ | | | ├── Accident_Town03_Route102_Weather20/
+ | | | └── ...
+ | | └── maps/ # maps of Towns
+ | | ├── Town01_HD_map.npz
+ | | ├── Town02_HD_map.npz
+ | | └── ...
+ | ├── others
+ | | └── b2d_motion_anchor_infos_mode6.pkl # motion anchors for UniAD
+ | └── splits
+ | └── bench2drive_base_train_val_split.json # trainval_split of Bench2Drive base
+
+```
+
+## Prepare Bench2Drive data info
+
+Run the following command:
+
+```
+cd mmcv/datasets
+python prepare_B2D.py --workers 16 # workers used to prepare data
+```
+
+The command generates `b2d_infos_train.pkl`, `b2d_infos_val.pkl`, and `b2d_map_infos.pkl` under `data/infos`.
+*Note: it takes about 1 hour to generate all the data with 16 workers.*
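+
+As a quick sanity check (not part of the official pipeline), you can open one of the generated info files with `pickle` and inspect its size; the exact structure of the entries is defined by `prepare_B2D.py`.
+
+```python
+import pickle
+
+# Hypothetical sanity check for a generated info file.
+with open("data/infos/b2d_infos_val.pkl", "rb") as f:
+    infos = pickle.load(f)
+
+print(type(infos))  # container type produced by prepare_B2D.py
+print(len(infos))   # number of entries; per-entry structure is defined by the script
+```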
+
+
+## Structure of code
+
+
+After installation and data preparation, the structure of the code will be as follows:
+
+```
+ Bench2DriveZoo
+ ├── adzoo/
+ | ├── bevformer/
+ | ├── uniad/
+ | └── vad/
+ ├── ckpts/
+ | ├── r101_dcn_fcos3d_pretrain.pth # pretrain weights for bevformer
+ | ├── resnet50-19c8e357.pth # image backbone pretrain weights for vad
+ | ├── bevformer_base_b2d.pth # download weights you need
+ | ├── uniad_base_b2d.pth # download weights you need
+ | └── ...
+ ├── data/
+ | ├── bench2drive/
+ | | ├── v1/ # Bench2Drive base
+ | | | ├── Accident_Town03_Route101_Weather23/
+ | | | ├── Accident_Town03_Route102_Weather20/
+ | | | └── ...
+ | | └── maps/ # maps of Towns
+ | | ├── Town01_HD_map.npz
+ | | ├── Town02_HD_map.npz
+ | | └── ...
+ │ ├── infos/
+ │ │ ├── b2d_infos_train.pkl
+ │ │ ├── b2d_infos_val.pkl
+ | | └── b2d_map_infos.pkl
+ | ├── others
+ | | └── b2d_motion_anchor_infos_mode6.pkl # motion anchors for UniAD
+ | └── splits
+ | └── bench2drive_base_train_val_split.json # trainval_split of Bench2Drive base
+ ├── docs/
+ ├── mmcv/
+ ├── team_code/ # for Closed-loop Evaluation in Carla
+```
+
+
+
diff --git a/docs/EVAL_IN_CARLA.md b/docs/EVAL_IN_CARLA.md
new file mode 100644
index 0000000..0d06c57
--- /dev/null
+++ b/docs/EVAL_IN_CARLA.md
@@ -0,0 +1,26 @@
+# Closed Loop Evaluation
+
+Please follow these steps to evaluate UniAD and VAD in Carla:
+
+## Preparations
+
+- Install this repo following the [installation doc](INSTALL.md).
+- Install Bench2Drive from [here](https://github.com/Thinklab-SJTU/Bench2Drive).
+
+
+## Link this repo to Bench2Drive
+
+```bash
+# Link your agent code into Bench2Drive
+cd Bench2Drive/leaderboard
+mkdir team_code
+cd team_code
+ln -s YOUR_TEAM_AGENT ./                       # link your agent code
+cd ../..                                       # back to the Bench2Drive root
+ln -s YOUR_BENCH2DRIVEZOO_PATH/team_code/* ./  # link this repo's team_code into Bench2Drive
+```
+
+## Run evaluation
+
+Follow [this guide](https://github.com/Thinklab-SJTU/Bench2Drive?tab=readme-ov-file#eval-tools) to use the evaluation tools of Bench2Drive.
+
diff --git a/docs/INSTALL.md b/docs/INSTALL.md
new file mode 100644
index 0000000..2e69655
--- /dev/null
+++ b/docs/INSTALL.md
@@ -0,0 +1,52 @@
+## Follow these steps to install the environment
+- **STEP 1: Create environment**
+ ```
+ conda create -n uniad python=3.8
+ conda activate uniad
+ ```
+- **STEP 2: Install cudatoolkit**
+ ```
+ conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit
+ ```
+- **STEP 3: Install torch**
+ ```
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+ ```
+- **STEP 4: Set environment variables**
+ ```
+ export PATH=YOUR_GCC_PATH/bin:$PATH
+ export CUDA_HOME=YOUR_CUDA_PATH/
+ ```
+- **STEP 5: Install ninja and packaging**
+ ```
+ pip install ninja packaging
+ ```
+- **STEP 6: Install our repo**
+ ```
+ pip install -v -e .
+ ```
+
+- **STEP 7: Prepare pretrained weights.**
+ create directory `ckpts`
+
+ ```
+ mkdir ckpts
+ ```
+  Download `resnet50-19c8e357.pth` from [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/resnet50-19c8e357.pth) or [Baidu Cloud](https://pan.baidu.com/s/1LlSrbYvghnv3lOlX1uLU5g?pwd=1234)
+  Download `r101_dcn_fcos3d_pretrain.pth` from [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/r101_dcn_fcos3d_pretrain.pth) or [Baidu Cloud](https://pan.baidu.com/s/1o7owaQ5G66xqq2S0TldwXQ?pwd=1234)
+
+
+- **STEP 8: Install Carla for closed-loop evaluation.**
+
+
+ ```
+ mkdir carla
+ cd carla
+ wget https://carla-releases.s3.us-east-005.backblazeb2.com/Linux/CARLA_0.9.15.tar.gz
+ tar -xvf CARLA_0.9.15.tar.gz
+ cd Import && wget https://carla-releases.s3.us-east-005.backblazeb2.com/Linux/AdditionalMaps_0.9.15.tar.gz
+ cd .. && bash ImportAssets.sh
+ export CARLA_ROOT=YOUR_CARLA_PATH
+  echo "$CARLA_ROOT/PythonAPI/carla/dist/carla-0.9.15-py3.7-linux-x86_64.egg" >> YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV_NAME/lib/python3.8/site-packages/carla.pth # the conda env above uses Python 3.8, so site-packages is under python3.8; the py3.7 egg still works. Set YOUR_CONDA_PATH and YOUR_CONDA_ENV_NAME
+
+ ```
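+
+  The following quick check (not from the original doc) confirms that the CARLA Python API is importable in the conda environment after the `.pth` entry has been added:
+
+  ```python
+  # Quick sanity check that the CARLA egg registered via carla.pth is importable.
+  import carla
+
+  print("carla imported from:", carla.__file__)
+  ```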
\ No newline at end of file
diff --git a/docs/TRAIN_EVAL.md b/docs/TRAIN_EVAL.md
new file mode 100644
index 0000000..ce66ff0
--- /dev/null
+++ b/docs/TRAIN_EVAL.md
@@ -0,0 +1,68 @@
+# Train/Eval models
+
+You can use the following commands to train and evaluate [BEVFormer](https://github.com/fundamentalvision/BEVFormer), [UniAD](https://github.com/OpenDriveLab/UniAD), and [VAD](https://github.com/hustvl/VAD).
+
+## BEVFormer
+
+### Train
+
+```bash
+#train BEVFormer base
+./adzoo/bevformer/dist_train.sh ./adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py 4 #N_GPUS
+#train BEVFormer tiny
+./adzoo/bevformer/dist_train.sh ./adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py 4 #N_GPUS
+```
+### Open loop eval
+
+```bash
+#eval BEVFormer base
+./adzoo/bevformer/dist_test.sh ./adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py ./ckpts/bevformer_base_b2d.pth 1
+#eval BEVFormer tiny
+./adzoo/bevformer/dist_test.sh ./adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py ./ckpts/bevformer_tiny_b2d.pth 1
+```
+
+
+## UniAD
+
+### Train stage1
+```bash
+#train UniAD base
+./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py 4
+#train UniAD tiny
+./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py 4
+```
+
+### Train stage2
+```bash
+#train UniAD base
+./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py 1
+#train UniAD tiny
+./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py 1
+```
+
+
+### Open loop eval
+
+```bash
+#eval UniAD base
+./adzoo/uniad/uniad_dist_eval.sh ./adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py ./ckpts/uniad_base_b2d.pth 1
+#eval UniAD tiny
+./adzoo/uniad/uniad_dist_eval.sh ./adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py ./ckpts/uniad_tiny_b2d.pth 1
+```
+
+
+## VAD
+
+### Train
+
+```bash
+#train VAD base
+./adzoo/vad/dist_train.sh ./adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py 1 #N_GPUS
+```
+
+### Open loop eval
+
+```bash
+./adzoo/vad/dist_test.sh ./adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py ./ckpts/vad_b2d_base.pth 1
+```
+
+**NOTE**: UniAD and VAD use different definitions when computing Planning L2. UniAD reports L2 at each time step (0.5s, 1.0s, 1.5s, ...), while VAD reports the average over each time period (0s-0.5s, 0s-1.0s, 0s-1.5s, ...). We retain the original calculation logic in the code, but report UniAD's Planning L2 converted to VAD's definition (see the sketch below).
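+
+The conversion is a running average over the per-step values; the sketch below uses hypothetical numbers and simply implements the two definitions stated above.
+
+```python
+import numpy as np
+
+# Hypothetical per-step L2 values at 0.5s, 1.0s, ..., 3.0s (UniAD-style).
+l2_per_step = np.array([0.20, 0.45, 0.75, 1.10, 1.50, 1.95])
+
+# VAD-style: average over each growing period (0-0.5s, 0-1.0s, ..., 0-3.0s).
+l2_per_period = np.cumsum(l2_per_step) / np.arange(1, len(l2_per_step) + 1)
+
+print(l2_per_period)  # running averages of l2_per_step
+```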
\ No newline at end of file
diff --git a/mmcv/__init__.py b/mmcv/__init__.py
new file mode 100644
index 0000000..29f79b1
--- /dev/null
+++ b/mmcv/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# flake8: noqa
+__version__ = '0.0.1'
+
+from .fileio import *
+from .image import *
+from .utils import *
+from .core.bbox.coder.nms_free_coder import NMSFreeCoder
+from .core.bbox.match_costs import BBox3DL1Cost, DiceCost
+from .core.evaluation.eval_hooks import CustomDistEvalHook
+from .models.utils import *
+from .models.opt.adamw import AdamW2
+from .losses import *
+from .structures import Instances, BoxMode, Boxes
+from .layers import cat, Conv2d, batched_nms, get_norm
\ No newline at end of file
diff --git a/mmcv/core/__init__.py b/mmcv/core/__init__.py
new file mode 100644
index 0000000..a401238
--- /dev/null
+++ b/mmcv/core/__init__.py
@@ -0,0 +1,10 @@
+from .anchor import * # noqa: F401, F403
+from .bbox import * # noqa: F401, F403
+from .evaluation import * # noqa: F401, F403
+from .points import * # noqa: F401, F403
+from .mask import * # noqa: F401, F403
+from .post_processing import * # noqa: F401, F403
+from .utils import * # noqa: F401, F403
+# from .seg import * # noqa: F401, F403
+from .visualizer import * # noqa: F401, F403
+from .voxel import * # noqa: F401, F403
diff --git a/mmcv/core/anchor/__init__.py b/mmcv/core/anchor/__init__.py
new file mode 100644
index 0000000..e3262a7
--- /dev/null
+++ b/mmcv/core/anchor/__init__.py
@@ -0,0 +1,18 @@
+from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator,
+ YOLOAnchorGenerator)
+from .builder import (ANCHOR_GENERATORS, PRIOR_GENERATORS,
+ build_anchor_generator, build_prior_generator)
+from .point_generator import MlvlPointGenerator, PointGenerator
+from .utils import anchor_inside_flags, calc_region, images_to_levels
+from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator,
+ AlignedAnchor3DRangeGeneratorPerCls,
+ Anchor3DRangeGenerator)
+
+__all__ = [
+ 'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags',
+ 'PointGenerator', 'images_to_levels', 'calc_region',
+ 'build_anchor_generator', 'ANCHOR_GENERATORS', 'YOLOAnchorGenerator',
+ 'build_prior_generator', 'PRIOR_GENERATORS', 'MlvlPointGenerator',
+ 'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator',
+ 'AlignedAnchor3DRangeGeneratorPerCls'
+]
diff --git a/mmcv/core/anchor/anchor_3d_generator.py b/mmcv/core/anchor/anchor_3d_generator.py
new file mode 100644
index 0000000..118f6ea
--- /dev/null
+++ b/mmcv/core/anchor/anchor_3d_generator.py
@@ -0,0 +1,404 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.core.anchor import ANCHOR_GENERATORS
+from mmcv.utils import is_list_of
+
+
+@ANCHOR_GENERATORS.register_module()
+class Anchor3DRangeGenerator(object):
+ """3D Anchor Generator by range.
+
+ This anchor generator generates anchors by the given range in different
+ feature levels.
+    Due to the convention in 3D detection, different anchor sizes are related
+    to different ranges for different categories. However, we find this
+    setting does not affect the performance much in some datasets, e.g.,
+    nuScenes.
+
+ Args:
+ ranges (list[list[float]]): Ranges of different anchors.
+ The ranges are the same across different feature levels. But may
+ vary for different anchor sizes if size_per_range is True.
+ sizes (list[list[float]]): 3D sizes of anchors.
+ scales (list[int]): Scales of anchors in different feature levels.
+ rotations (list[float]): Rotations of anchors in a feature grid.
+ custom_values (tuple[float]): Customized values of that anchor. For
+ example, in nuScenes the anchors have velocities.
+        reshape_out (bool): Whether to reshape the output anchors into a 2D
+            tensor of shape (N, box_dim).
+        size_per_range (bool): Whether to use separate ranges for different
+            sizes. If size_per_range is True, the ranges should have the same
+            length as the sizes; if not, the single range will be duplicated
+            to match the number of sizes.
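+
+    Examples:
+        A minimal usage sketch (illustrative only; the range, feature map
+        size and device below are placeholder values, not taken from any
+        config in this repo):
+
+        >>> anchor_gen = Anchor3DRangeGenerator(
+        ...     ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]])
+        >>> anchors = anchor_gen.grid_anchors([(200, 176)], device='cpu')
+        >>> # 200 * 176 locations x 1 size x 2 rotations = 70400 anchors of dim 7
+        >>> anchors[0].shape
+        torch.Size([70400, 7])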
+ """
+
+ def __init__(self,
+ ranges,
+ sizes=[[1.6, 3.9, 1.56]],
+ scales=[1],
+ rotations=[0, 1.5707963],
+ custom_values=(),
+ reshape_out=True,
+ size_per_range=True):
+ assert is_list_of(ranges, list)
+ if size_per_range:
+ if len(sizes) != len(ranges):
+ assert len(ranges) == 1
+ ranges = ranges * len(sizes)
+ assert len(ranges) == len(sizes)
+ else:
+ assert len(ranges) == 1
+ assert is_list_of(sizes, list)
+ assert isinstance(scales, list)
+
+ self.sizes = sizes
+ self.scales = scales
+ self.ranges = ranges
+ self.rotations = rotations
+ self.custom_values = custom_values
+ self.cached_anchors = None
+ self.reshape_out = reshape_out
+ self.size_per_range = size_per_range
+
+ def __repr__(self):
+ s = self.__class__.__name__ + '('
+ s += f'anchor_range={self.ranges},\n'
+ s += f'scales={self.scales},\n'
+ s += f'sizes={self.sizes},\n'
+ s += f'rotations={self.rotations},\n'
+ s += f'reshape_out={self.reshape_out},\n'
+ s += f'size_per_range={self.size_per_range})'
+ return s
+
+ @property
+ def num_base_anchors(self):
+ """list[int]: Total number of base anchors in a feature grid."""
+ num_rot = len(self.rotations)
+ num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0)
+ return num_rot * num_size
+
+ @property
+ def num_levels(self):
+ """int: Number of feature levels that the generator is applied to."""
+ return len(self.scales)
+
+ def grid_anchors(self, featmap_sizes, device='cuda'):
+ """Generate grid anchors in multiple feature levels.
+
+ Args:
+ featmap_sizes (list[tuple]): List of feature map sizes in
+ multiple feature levels.
+ device (str): Device where the anchors will be put on.
+
+ Returns:
+ list[torch.Tensor]: Anchors in multiple feature levels. \
+                The sizes of each tensor should be [N, box_dim], where \
+                N = width * height * num_base_anchors, width and height \
+                are the sizes of the corresponding feature level, \
+ num_base_anchors is the number of anchors for that level.
+ """
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_anchors = []
+ for i in range(self.num_levels):
+ anchors = self.single_level_grid_anchors(
+ featmap_sizes[i], self.scales[i], device=device)
+ if self.reshape_out:
+ anchors = anchors.reshape(-1, anchors.size(-1))
+ multi_level_anchors.append(anchors)
+ return multi_level_anchors
+
+ def single_level_grid_anchors(self, featmap_size, scale, device='cuda'):
+ """Generate grid anchors of a single level feature map.
+
+ This function is usually called by method ``self.grid_anchors``.
+
+ Args:
+ featmap_size (tuple[int]): Size of the feature map.
+ scale (float): Scale factor of the anchors in the current level.
+ device (str, optional): Device the tensor will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+ torch.Tensor: Anchors in the overall feature map.
+ """
+        # We reimplement the anchor generator using torch on CUDA:
+        # torch: 0.6975 s for 1000 runs
+        # numpy: 4.3345 s for 1000 runs
+        # i.e. roughly 6x faster than the numpy implementation
+ if not self.size_per_range:
+ return self.anchors_single_range(
+ featmap_size,
+ self.ranges[0],
+ scale,
+ self.sizes,
+ self.rotations,
+ device=device)
+
+ mr_anchors = []
+ for anchor_range, anchor_size in zip(self.ranges, self.sizes):
+ mr_anchors.append(
+ self.anchors_single_range(
+ featmap_size,
+ anchor_range,
+ scale,
+ anchor_size,
+ self.rotations,
+ device=device))
+ mr_anchors = torch.cat(mr_anchors, dim=-3)
+ return mr_anchors
+
+ def anchors_single_range(self,
+ feature_size,
+ anchor_range,
+ scale=1,
+ sizes=[[1.6, 3.9, 1.56]],
+ rotations=[0, 1.5707963],
+ device='cuda'):
+ """Generate anchors in a single range.
+
+ Args:
+ feature_size (list[float] | tuple[float]): Feature map size. It is
+                either a list or a tuple of [D, H, W] (in order of z, y, and x).
+ anchor_range (torch.Tensor | list[float]): Range of anchors with
+ shape [6]. The order is consistent with that of anchors, i.e.,
+ (x_min, y_min, z_min, x_max, y_max, z_max).
+ scale (float | int, optional): The scale factor of anchors.
+ sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
+ shape [N, 3], in order of x, y, z.
+ rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
+ anchors in a single feature grid.
+ device (str): Devices that the anchors will be put on.
+
+ Returns:
+ torch.Tensor: Anchors with shape \
+ [*feature_size, num_sizes, num_rots, 7].
+ """
+ if len(feature_size) == 2:
+ feature_size = [1, feature_size[0], feature_size[1]]
+ anchor_range = torch.tensor(anchor_range, device=device)
+ z_centers = torch.linspace(
+ anchor_range[2], anchor_range[5], feature_size[0], device=device)
+ y_centers = torch.linspace(
+ anchor_range[1], anchor_range[4], feature_size[1], device=device)
+ x_centers = torch.linspace(
+ anchor_range[0], anchor_range[3], feature_size[2], device=device)
+ sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
+ rotations = torch.tensor(rotations, device=device)
+
+        # torch.meshgrid's default indexing is 'ij', while np.meshgrid's default is 'xy'
+ rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations)
+ # torch.meshgrid returns a tuple rather than list
+ rets = list(rets)
+ tile_shape = [1] * 5
+ tile_shape[-2] = int(sizes.shape[0])
+ for i in range(len(rets)):
+ rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)
+
+ sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
+ tile_size_shape = list(rets[0].shape)
+ tile_size_shape[3] = 1
+ sizes = sizes.repeat(tile_size_shape)
+ rets.insert(3, sizes)
+
+ ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
+ # [1, 200, 176, N, 2, 7] for kitti after permute
+
+ if len(self.custom_values) > 0:
+ custom_ndim = len(self.custom_values)
+ custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
+ # custom[:] = self.custom_values
+ ret = torch.cat([ret, custom], dim=-1)
+ # [1, 200, 176, N, 2, 9] for nus dataset after permute
+ return ret
+
+
+@ANCHOR_GENERATORS.register_module()
+class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator):
+ """Aligned 3D Anchor Generator by range.
+
+ This anchor generator uses a different manner to generate the positions
+ of anchors' centers from :class:`Anchor3DRangeGenerator`.
+
+ Note:
+ The `align` means that the anchor's center is aligned with the voxel
+ grid, which is also the feature grid. The previous implementation of
+ :class:`Anchor3DRangeGenerator` does not generate the anchors' center
+ according to the voxel grid. Rather, it generates the center by
+ uniformly distributing the anchors inside the minimum and maximum
+ anchor ranges according to the feature map sizes.
+        However, this makes the anchor centers mismatch the feature grid.
+        The :class:`AlignedAnchor3DRangeGenerator` adds 1 to the feature map
+        sizes when computing the corners of the voxel grid, then shifts the
+        coordinates to the center of the voxel grid and uses the top-left
+        corner to distribute anchors.
+
+ Args:
+        align_corner (bool): Whether to align with the corner of the voxel
+            grid. By default it is False and the anchor's center will be
+            the same as the corresponding voxel's center, which is also the
+            center of the corresponding feature grid.
+ """
+
+ def __init__(self, align_corner=False, **kwargs):
+ super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs)
+ self.align_corner = align_corner
+
+ def anchors_single_range(self,
+ feature_size,
+ anchor_range,
+ scale,
+ sizes=[[1.6, 3.9, 1.56]],
+ rotations=[0, 1.5707963],
+ device='cuda'):
+ """Generate anchors in a single range.
+
+ Args:
+ feature_size (list[float] | tuple[float]): Feature map size. It is
+                either a list or a tuple of [D, H, W] (in order of z, y, and x).
+ anchor_range (torch.Tensor | list[float]): Range of anchors with
+ shape [6]. The order is consistent with that of anchors, i.e.,
+ (x_min, y_min, z_min, x_max, y_max, z_max).
+ scale (float | int, optional): The scale factor of anchors.
+ sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
+ shape [N, 3], in order of x, y, z.
+ rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
+ anchors in a single feature grid.
+ device (str): Devices that the anchors will be put on.
+
+ Returns:
+ torch.Tensor: Anchors with shape \
+ [*feature_size, num_sizes, num_rots, 7].
+ """
+ if len(feature_size) == 2:
+ feature_size = [1, feature_size[0], feature_size[1]]
+ anchor_range = torch.tensor(anchor_range, device=device)
+ z_centers = torch.linspace(
+ anchor_range[2],
+ anchor_range[5],
+ feature_size[0] + 1,
+ device=device)
+ y_centers = torch.linspace(
+ anchor_range[1],
+ anchor_range[4],
+ feature_size[1] + 1,
+ device=device)
+ x_centers = torch.linspace(
+ anchor_range[0],
+ anchor_range[3],
+ feature_size[2] + 1,
+ device=device)
+ sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
+ rotations = torch.tensor(rotations, device=device)
+
+ # shift the anchor center
+ if not self.align_corner:
+ z_shift = (z_centers[1] - z_centers[0]) / 2
+ y_shift = (y_centers[1] - y_centers[0]) / 2
+ x_shift = (x_centers[1] - x_centers[0]) / 2
+ z_centers += z_shift
+ y_centers += y_shift
+ x_centers += x_shift
+
+        # torch.meshgrid's default indexing is 'ij', while np.meshgrid's default is 'xy'
+ rets = torch.meshgrid(x_centers[:feature_size[2]],
+ y_centers[:feature_size[1]],
+ z_centers[:feature_size[0]], rotations)
+
+ # torch.meshgrid returns a tuple rather than list
+ rets = list(rets)
+ tile_shape = [1] * 5
+ tile_shape[-2] = int(sizes.shape[0])
+ for i in range(len(rets)):
+ rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)
+
+ sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
+ tile_size_shape = list(rets[0].shape)
+ tile_size_shape[3] = 1
+ sizes = sizes.repeat(tile_size_shape)
+ rets.insert(3, sizes)
+
+ ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
+
+ if len(self.custom_values) > 0:
+ custom_ndim = len(self.custom_values)
+ custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
+ # TODO: check the support of custom values
+ # custom[:] = self.custom_values
+ ret = torch.cat([ret, custom], dim=-1)
+ return ret
+
+
+@ANCHOR_GENERATORS.register_module()
+class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator):
+ """3D Anchor Generator by range for per class.
+
+ This anchor generator generates anchors by the given range for per class.
+ Note that feature maps of different classes may be different.
+
+ Args:
+ kwargs (dict): Arguments are the same as those in \
+ :class:`AlignedAnchor3DRangeGenerator`.
+ """
+
+ def __init__(self, **kwargs):
+ super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs)
+ assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \
+ ' not supported currently in this kind of anchor generator.'
+
+ def grid_anchors(self, featmap_sizes, device='cuda'):
+ """Generate grid anchors in multiple feature levels.
+
+ Args:
+ featmap_sizes (list[tuple]): List of feature map sizes for \
+ different classes in a single feature level.
+ device (str): Device where the anchors will be put on.
+
+ Returns:
+ list[list[torch.Tensor]]: Anchors in multiple feature levels. \
+ Note that in this anchor generator, we currently only \
+ support single feature level. The sizes of each tensor \
+ should be [num_sizes/ranges*num_rots*featmap_size, \
+ box_code_size].
+ """
+ multi_level_anchors = []
+ anchors = self.multi_cls_grid_anchors(
+ featmap_sizes, self.scales[0], device=device)
+ multi_level_anchors.append(anchors)
+ return multi_level_anchors
+
+ def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'):
+ """Generate grid anchors of a single level feature map for multi-class
+ with different feature map sizes.
+
+ This function is usually called by method ``self.grid_anchors``.
+
+ Args:
+ featmap_sizes (list[tuple]): List of feature map sizes for \
+ different classes in a single feature level.
+ scale (float): Scale factor of the anchors in the current level.
+ device (str, optional): Device the tensor will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+ torch.Tensor: Anchors in the overall feature map.
+ """
+ assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \
+            'The number of feature map sizes, anchor sizes and ' + \
+            'ranges should be the same.'
+
+ multi_cls_anchors = []
+ for i in range(len(featmap_sizes)):
+ anchors = self.anchors_single_range(
+ featmap_sizes[i],
+ self.ranges[i],
+ scale,
+ self.sizes[i],
+ self.rotations,
+ device=device)
+ # [*featmap_size, num_sizes/ranges, num_rots, box_code_size]
+ ndim = len(featmap_sizes[i])
+ anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1))
+ # [*featmap_size, num_sizes/ranges*num_rots, box_code_size]
+ anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1)
+ # [num_sizes/ranges*num_rots, *featmap_size, box_code_size]
+ multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1)))
+ # [num_sizes/ranges*num_rots*featmap_size, box_code_size]
+ return multi_cls_anchors
diff --git a/mmcv/core/anchor/anchor_generator.py b/mmcv/core/anchor/anchor_generator.py
new file mode 100644
index 0000000..2b8c7d8
--- /dev/null
+++ b/mmcv/core/anchor/anchor_generator.py
@@ -0,0 +1,838 @@
+import warnings
+
+from mmcv.utils import is_tuple_of
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+from .builder import PRIOR_GENERATORS
+
+
+@PRIOR_GENERATORS.register_module()
+class AnchorGenerator:
+ """Standard anchor generator for 2D anchor-based detectors.
+
+ Args:
+ strides (list[int] | list[tuple[int, int]]): Strides of anchors
+ in multiple feature levels in order (w, h).
+ ratios (list[float]): The list of ratios between the height and width
+ of anchors in a single level.
+ scales (list[int] | None): Anchor scales for anchors in a single level.
+            It cannot be set at the same time as `octave_base_scale` and
+            `scales_per_octave`.
+ base_sizes (list[int] | None): The basic sizes
+ of anchors in multiple levels.
+ If None is given, strides will be used as base_sizes.
+ (If strides are non square, the shortest stride is taken.)
+ scale_major (bool): Whether to multiply scales first when generating
+ base anchors. If true, the anchors in the same row will have the
+ same scales. By default it is True in V2.0
+ octave_base_scale (int): The base scale of octave.
+ scales_per_octave (int): Number of scales for each octave.
+ `octave_base_scale` and `scales_per_octave` are usually used in
+ retinanet and the `scales` should be None when they are set.
+ centers (list[tuple[float, float]] | None): The centers of the anchor
+ relative to the feature grid center in multiple feature levels.
+ By default it is set to be None and not used. If a list of tuple of
+ float is given, they will be used to shift the centers of anchors.
+ center_offset (float): The offset of center in proportion to anchors'
+ width and height. By default it is 0 in V2.0.
+
+ Examples:
+ >>> from mmcv.core import AnchorGenerator
+ >>> self = AnchorGenerator([16], [1.], [1.], [9])
+ >>> all_anchors = self.grid_anchors([(2, 2)], device='cpu')
+ >>> print(all_anchors)
+ [tensor([[-4.5000, -4.5000, 4.5000, 4.5000],
+ [11.5000, -4.5000, 20.5000, 4.5000],
+ [-4.5000, 11.5000, 4.5000, 20.5000],
+ [11.5000, 11.5000, 20.5000, 20.5000]])]
+ >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18])
+ >>> all_anchors = self.grid_anchors([(2, 2), (1, 1)], device='cpu')
+ >>> print(all_anchors)
+ [tensor([[-4.5000, -4.5000, 4.5000, 4.5000],
+ [11.5000, -4.5000, 20.5000, 4.5000],
+ [-4.5000, 11.5000, 4.5000, 20.5000],
+ [11.5000, 11.5000, 20.5000, 20.5000]]), \
+ tensor([[-9., -9., 9., 9.]])]
+ """
+
+ def __init__(self,
+ strides,
+ ratios,
+ scales=None,
+ base_sizes=None,
+ scale_major=True,
+ octave_base_scale=None,
+ scales_per_octave=None,
+ centers=None,
+ center_offset=0.):
+ # check center and center_offset
+ if center_offset != 0:
+ assert centers is None, 'center cannot be set when center_offset' \
+ f'!=0, {centers} is given.'
+ if not (0 <= center_offset <= 1):
+ raise ValueError('center_offset should be in range [0, 1], '
+ f'{center_offset} is given.')
+ if centers is not None:
+ assert len(centers) == len(strides), \
+ 'The number of strides should be the same as centers, got ' \
+ f'{strides} and {centers}'
+
+ # calculate base sizes of anchors
+ self.strides = [_pair(stride) for stride in strides]
+ self.base_sizes = [min(stride) for stride in self.strides
+ ] if base_sizes is None else base_sizes
+ assert len(self.base_sizes) == len(self.strides), \
+ 'The number of strides should be the same as base sizes, got ' \
+ f'{self.strides} and {self.base_sizes}'
+
+ # calculate scales of anchors
+ assert ((octave_base_scale is not None
+ and scales_per_octave is not None) ^ (scales is not None)), \
+ 'scales and octave_base_scale with scales_per_octave cannot' \
+ ' be set at the same time'
+ if scales is not None:
+ self.scales = torch.Tensor(scales)
+ elif octave_base_scale is not None and scales_per_octave is not None:
+ octave_scales = np.array(
+ [2**(i / scales_per_octave) for i in range(scales_per_octave)])
+ scales = octave_scales * octave_base_scale
+ self.scales = torch.Tensor(scales)
+ else:
+ raise ValueError('Either scales or octave_base_scale with '
+ 'scales_per_octave should be set')
+
+ self.octave_base_scale = octave_base_scale
+ self.scales_per_octave = scales_per_octave
+ self.ratios = torch.Tensor(ratios)
+ self.scale_major = scale_major
+ self.centers = centers
+ self.center_offset = center_offset
+ self.base_anchors = self.gen_base_anchors()
+
+ @property
+ def num_base_anchors(self):
+ """list[int]: total number of base anchors in a feature grid"""
+ return self.num_base_priors
+
+ @property
+ def num_base_priors(self):
+ """list[int]: The number of priors (anchors) at a point
+ on the feature grid"""
+ return [base_anchors.size(0) for base_anchors in self.base_anchors]
+
+ @property
+ def num_levels(self):
+ """int: number of feature levels that the generator will be applied"""
+ return len(self.strides)
+
+ def gen_base_anchors(self):
+ """Generate base anchors.
+
+ Returns:
+ list(torch.Tensor): Base anchors of a feature grid in multiple \
+ feature levels.
+ """
+ multi_level_base_anchors = []
+ for i, base_size in enumerate(self.base_sizes):
+ center = None
+ if self.centers is not None:
+ center = self.centers[i]
+ multi_level_base_anchors.append(
+ self.gen_single_level_base_anchors(
+ base_size,
+ scales=self.scales,
+ ratios=self.ratios,
+ center=center))
+ return multi_level_base_anchors
+
+ def gen_single_level_base_anchors(self,
+ base_size,
+ scales,
+ ratios,
+ center=None):
+ """Generate base anchors of a single level.
+
+ Args:
+ base_size (int | float): Basic size of an anchor.
+ scales (torch.Tensor): Scales of the anchor.
+            ratios (torch.Tensor): The ratio between the height and width
+                of anchors in a single level.
+ center (tuple[float], optional): The center of the base anchor
+ related to a single feature grid. Defaults to None.
+
+ Returns:
+ torch.Tensor: Anchors in a single-level feature maps.
+ """
+ w = base_size
+ h = base_size
+ if center is None:
+ x_center = self.center_offset * w
+ y_center = self.center_offset * h
+ else:
+ x_center, y_center = center
+
+ h_ratios = torch.sqrt(ratios)
+ w_ratios = 1 / h_ratios
+ if self.scale_major:
+ ws = (w * w_ratios[:, None] * scales[None, :]).view(-1)
+ hs = (h * h_ratios[:, None] * scales[None, :]).view(-1)
+ else:
+ ws = (w * scales[:, None] * w_ratios[None, :]).view(-1)
+ hs = (h * scales[:, None] * h_ratios[None, :]).view(-1)
+
+ # use float anchor and the anchor's center is aligned with the
+ # pixel center
+ base_anchors = [
+ x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws,
+ y_center + 0.5 * hs
+ ]
+ base_anchors = torch.stack(base_anchors, dim=-1)
+
+ return base_anchors
+
+ def _meshgrid(self, x, y, row_major=True):
+ """Generate mesh grid of x and y.
+
+ Args:
+ x (torch.Tensor): Grids of x dimension.
+ y (torch.Tensor): Grids of y dimension.
+ row_major (bool, optional): Whether to return y grids first.
+ Defaults to True.
+
+ Returns:
+ tuple[torch.Tensor]: The mesh grids of x and y.
+ """
+ # use shape instead of len to keep tracing while exporting to onnx
+ xx = x.repeat(y.shape[0])
+ yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1)
+ if row_major:
+ return xx, yy
+ else:
+ return yy, xx
+
+ def grid_priors(self, featmap_sizes, device='cuda'):
+ """Generate grid anchors in multiple feature levels.
+
+ Args:
+ featmap_sizes (list[tuple]): List of feature map sizes in
+ multiple feature levels.
+ device (str): The device where the anchors will be put on.
+
+ Return:
+ list[torch.Tensor]: Anchors in multiple feature levels. \
+ The sizes of each tensor should be [N, 4], where \
+ N = width * height * num_base_anchors, width and height \
+ are the sizes of the corresponding feature level, \
+ num_base_anchors is the number of anchors for that level.
+ """
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_anchors = []
+ for i in range(self.num_levels):
+ anchors = self.single_level_grid_priors(
+ featmap_sizes[i], level_idx=i, device=device)
+ multi_level_anchors.append(anchors)
+ return multi_level_anchors
+
+ def single_level_grid_priors(self, featmap_size, level_idx, device='cuda'):
+ """Generate grid anchors of a single level.
+
+ Note:
+ This function is usually called by method ``self.grid_priors``.
+
+ Args:
+ featmap_size (tuple[int]): Size of the feature maps.
+ level_idx (int): The index of corresponding feature map level.
+ device (str, optional): The device the tensor will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+ torch.Tensor: Anchors in the overall feature maps.
+ """
+
+ base_anchors = self.base_anchors[level_idx].to(device)
+ feat_h, feat_w = featmap_size
+ stride_w, stride_h = self.strides[level_idx]
+ shift_x = torch.arange(0, feat_w, device=device) * stride_w
+ shift_y = torch.arange(0, feat_h, device=device) * stride_h
+
+ shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+ shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
+ shifts = shifts.type_as(base_anchors)
+ # first feat_w elements correspond to the first row of shifts
+ # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
+ # shifted anchors (K, A, 4), reshape to (K*A, 4)
+
+ all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+ all_anchors = all_anchors.view(-1, 4)
+ # first A rows correspond to A anchors of (0, 0) in feature map,
+ # then (0, 1), (0, 2), ...
+ return all_anchors
+
+ def sparse_priors(self,
+ prior_idxs,
+ featmap_size,
+ level_idx,
+ dtype=torch.float32,
+ device='cuda'):
+ """Generate sparse anchors according to the ``prior_idxs``.
+
+ Args:
+ prior_idxs (Tensor): The index of corresponding anchors
+ in the feature map.
+ featmap_size (tuple[int]): feature map size arrange as (h, w).
+ level_idx (int): The level index of corresponding feature
+ map.
+            dtype (obj:`torch.dtype`): Data type of points. Defaults to
+                ``torch.float32``.
+            device (obj:`torch.device`): The device where the points are
+                located.
+ Returns:
+ Tensor: Anchor with shape (N, 4), N should be equal to
+ the length of ``prior_idxs``.
+ """
+
+ height, width = featmap_size
+ num_base_anchors = self.num_base_anchors[level_idx]
+ base_anchor_id = prior_idxs % num_base_anchors
+ x = (prior_idxs //
+ num_base_anchors) % width * self.strides[level_idx][0]
+ y = (prior_idxs // width //
+ num_base_anchors) % height * self.strides[level_idx][1]
+ priors = torch.stack([x, y, x, y], 1).to(dtype).to(device) + \
+ self.base_anchors[level_idx][base_anchor_id, :].to(device)
+
+ return priors
+
+ def grid_anchors(self, featmap_sizes, device='cuda'):
+ """Generate grid anchors in multiple feature levels.
+
+ Args:
+ featmap_sizes (list[tuple]): List of feature map sizes in
+ multiple feature levels.
+ device (str): Device where the anchors will be put on.
+
+ Return:
+ list[torch.Tensor]: Anchors in multiple feature levels. \
+ The sizes of each tensor should be [N, 4], where \
+ N = width * height * num_base_anchors, width and height \
+ are the sizes of the corresponding feature level, \
+ num_base_anchors is the number of anchors for that level.
+ """
+ warnings.warn('``grid_anchors`` would be deprecated soon. '
+ 'Please use ``grid_priors`` ')
+
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_anchors = []
+ for i in range(self.num_levels):
+ anchors = self.single_level_grid_anchors(
+ self.base_anchors[i].to(device),
+ featmap_sizes[i],
+ self.strides[i],
+ device=device)
+ multi_level_anchors.append(anchors)
+ return multi_level_anchors
+
+ def single_level_grid_anchors(self,
+ base_anchors,
+ featmap_size,
+ stride=(16, 16),
+ device='cuda'):
+ """Generate grid anchors of a single level.
+
+ Note:
+ This function is usually called by method ``self.grid_anchors``.
+
+ Args:
+ base_anchors (torch.Tensor): The base anchors of a feature grid.
+ featmap_size (tuple[int]): Size of the feature maps.
+ stride (tuple[int], optional): Stride of the feature map in order
+ (w, h). Defaults to (16, 16).
+ device (str, optional): Device the tensor will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+ torch.Tensor: Anchors in the overall feature maps.
+ """
+
+ warnings.warn(
+ '``single_level_grid_anchors`` would be deprecated soon. '
+ 'Please use ``single_level_grid_priors`` ')
+
+ # keep featmap_size as Tensor instead of int, so that we
+        # can convert to ONNX correctly
+ feat_h, feat_w = featmap_size
+ shift_x = torch.arange(0, feat_w, device=device) * stride[0]
+ shift_y = torch.arange(0, feat_h, device=device) * stride[1]
+
+ shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+ shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
+ shifts = shifts.type_as(base_anchors)
+ # first feat_w elements correspond to the first row of shifts
+ # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
+ # shifted anchors (K, A, 4), reshape to (K*A, 4)
+
+ all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+ all_anchors = all_anchors.view(-1, 4)
+ # first A rows correspond to A anchors of (0, 0) in feature map,
+ # then (0, 1), (0, 2), ...
+ return all_anchors
+
+ def valid_flags(self, featmap_sizes, pad_shape, device='cuda'):
+ """Generate valid flags of anchors in multiple feature levels.
+
+ Args:
+ featmap_sizes (list(tuple)): List of feature map sizes in
+ multiple feature levels.
+ pad_shape (tuple): The padded shape of the image.
+ device (str): Device where the anchors will be put on.
+
+ Return:
+ list(torch.Tensor): Valid flags of anchors in multiple levels.
+ """
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_flags = []
+ for i in range(self.num_levels):
+ anchor_stride = self.strides[i]
+ feat_h, feat_w = featmap_sizes[i]
+ h, w = pad_shape[:2]
+ valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h)
+ valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w)
+ flags = self.single_level_valid_flags((feat_h, feat_w),
+ (valid_feat_h, valid_feat_w),
+ self.num_base_anchors[i],
+ device=device)
+ multi_level_flags.append(flags)
+ return multi_level_flags
+
+ def single_level_valid_flags(self,
+ featmap_size,
+ valid_size,
+ num_base_anchors,
+ device='cuda'):
+ """Generate the valid flags of anchor in a single feature map.
+
+ Args:
+ featmap_size (tuple[int]): The size of feature maps, arrange
+ as (h, w).
+ valid_size (tuple[int]): The valid size of the feature maps.
+ num_base_anchors (int): The number of base anchors.
+ device (str, optional): Device where the flags will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+ torch.Tensor: The valid flags of each anchor in a single level \
+ feature map.
+ """
+ feat_h, feat_w = featmap_size
+ valid_h, valid_w = valid_size
+ assert valid_h <= feat_h and valid_w <= feat_w
+ valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+ valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+ valid_x[:valid_w] = 1
+ valid_y[:valid_h] = 1
+ valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+ valid = valid_xx & valid_yy
+ valid = valid[:, None].expand(valid.size(0),
+ num_base_anchors).contiguous().view(-1)
+ return valid
+
+ def __repr__(self):
+ """str: a string that describes the module"""
+ indent_str = ' '
+ repr_str = self.__class__.__name__ + '(\n'
+ repr_str += f'{indent_str}strides={self.strides},\n'
+ repr_str += f'{indent_str}ratios={self.ratios},\n'
+ repr_str += f'{indent_str}scales={self.scales},\n'
+ repr_str += f'{indent_str}base_sizes={self.base_sizes},\n'
+ repr_str += f'{indent_str}scale_major={self.scale_major},\n'
+ repr_str += f'{indent_str}octave_base_scale='
+ repr_str += f'{self.octave_base_scale},\n'
+ repr_str += f'{indent_str}scales_per_octave='
+ repr_str += f'{self.scales_per_octave},\n'
+        repr_str += f'{indent_str}num_levels={self.num_levels},\n'
+ repr_str += f'{indent_str}centers={self.centers},\n'
+ repr_str += f'{indent_str}center_offset={self.center_offset})'
+ return repr_str
+
+
+@PRIOR_GENERATORS.register_module()
+class SSDAnchorGenerator(AnchorGenerator):
+ """Anchor generator for SSD.
+
+ Args:
+ strides (list[int] | list[tuple[int, int]]): Strides of anchors
+ in multiple feature levels.
+ ratios (list[float]): The list of ratios between the height and width
+ of anchors in a single level.
+ basesize_ratio_range (tuple(float)): Ratio range of anchors.
+ input_size (int): Size of feature map, 300 for SSD300,
+ 512 for SSD512.
+ scale_major (bool): Whether to multiply scales first when generating
+ base anchors. If true, the anchors in the same row will have the
+ same scales. It is always set to be False in SSD.
+ """
+
+ def __init__(self,
+ strides,
+ ratios,
+ basesize_ratio_range,
+ input_size=300,
+ scale_major=True):
+ assert len(strides) == len(ratios)
+ assert is_tuple_of(basesize_ratio_range, float)
+
+ self.strides = [_pair(stride) for stride in strides]
+ self.input_size = input_size
+ self.centers = [(stride[0] / 2., stride[1] / 2.)
+ for stride in self.strides]
+ self.basesize_ratio_range = basesize_ratio_range
+
+ # calculate anchor ratios and sizes
+ min_ratio, max_ratio = basesize_ratio_range
+ min_ratio = int(min_ratio * 100)
+ max_ratio = int(max_ratio * 100)
+ step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2))
+ min_sizes = []
+ max_sizes = []
+ for ratio in range(int(min_ratio), int(max_ratio) + 1, step):
+ min_sizes.append(int(self.input_size * ratio / 100))
+ max_sizes.append(int(self.input_size * (ratio + step) / 100))
+ if self.input_size == 300:
+ if basesize_ratio_range[0] == 0.15: # SSD300 COCO
+ min_sizes.insert(0, int(self.input_size * 7 / 100))
+ max_sizes.insert(0, int(self.input_size * 15 / 100))
+ elif basesize_ratio_range[0] == 0.2: # SSD300 VOC
+ min_sizes.insert(0, int(self.input_size * 10 / 100))
+ max_sizes.insert(0, int(self.input_size * 20 / 100))
+ else:
+ raise ValueError(
+ 'basesize_ratio_range[0] should be either 0.15'
+ 'or 0.2 when input_size is 300, got '
+ f'{basesize_ratio_range[0]}.')
+ elif self.input_size == 512:
+ if basesize_ratio_range[0] == 0.1: # SSD512 COCO
+ min_sizes.insert(0, int(self.input_size * 4 / 100))
+ max_sizes.insert(0, int(self.input_size * 10 / 100))
+ elif basesize_ratio_range[0] == 0.15: # SSD512 VOC
+ min_sizes.insert(0, int(self.input_size * 7 / 100))
+ max_sizes.insert(0, int(self.input_size * 15 / 100))
+ else:
+ raise ValueError('basesize_ratio_range[0] should be either 0.1'
+ 'or 0.15 when input_size is 512, got'
+ f' {basesize_ratio_range[0]}.')
+ else:
+ raise ValueError('Only support 300 or 512 in SSDAnchorGenerator'
+ f', got {self.input_size}.')
+
+ anchor_ratios = []
+ anchor_scales = []
+ for k in range(len(self.strides)):
+ scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])]
+ anchor_ratio = [1.]
+ for r in ratios[k]:
+ anchor_ratio += [1 / r, r] # 4 or 6 ratio
+ anchor_ratios.append(torch.Tensor(anchor_ratio))
+ anchor_scales.append(torch.Tensor(scales))
+
+ self.base_sizes = min_sizes
+ self.scales = anchor_scales
+ self.ratios = anchor_ratios
+ self.scale_major = scale_major
+ self.center_offset = 0
+ self.base_anchors = self.gen_base_anchors()
+
+ def gen_base_anchors(self):
+ """Generate base anchors.
+
+ Returns:
+ list(torch.Tensor): Base anchors of a feature grid in multiple \
+ feature levels.
+ """
+ multi_level_base_anchors = []
+ for i, base_size in enumerate(self.base_sizes):
+ base_anchors = self.gen_single_level_base_anchors(
+ base_size,
+ scales=self.scales[i],
+ ratios=self.ratios[i],
+ center=self.centers[i])
+ indices = list(range(len(self.ratios[i])))
+ indices.insert(1, len(indices))
+ base_anchors = torch.index_select(base_anchors, 0,
+ torch.LongTensor(indices))
+ multi_level_base_anchors.append(base_anchors)
+ return multi_level_base_anchors
+
+ def __repr__(self):
+ """str: a string that describes the module"""
+ indent_str = ' '
+ repr_str = self.__class__.__name__ + '(\n'
+ repr_str += f'{indent_str}strides={self.strides},\n'
+ repr_str += f'{indent_str}scales={self.scales},\n'
+ repr_str += f'{indent_str}scale_major={self.scale_major},\n'
+ repr_str += f'{indent_str}input_size={self.input_size},\n'
+ repr_str += f'{indent_str}scales={self.scales},\n'
+ repr_str += f'{indent_str}ratios={self.ratios},\n'
+ repr_str += f'{indent_str}num_levels={self.num_levels},\n'
+ repr_str += f'{indent_str}base_sizes={self.base_sizes},\n'
+ repr_str += f'{indent_str}basesize_ratio_range='
+ repr_str += f'{self.basesize_ratio_range})'
+ return repr_str
+
+
+@PRIOR_GENERATORS.register_module()
+class LegacyAnchorGenerator(AnchorGenerator):
+ """Legacy anchor generator used in MMDetection V1.x.
+
+ Note:
+ Difference to the V2.0 anchor generator:
+
+        1. The center offset of V1.x anchors is set to be 0.5 rather than 0.
+        2. The width/height are reduced by 1 when calculating the anchors' \
+            centers and corners to meet the V1.x coordinate system.
+ 3. The anchors' corners are quantized.
+
+ Args:
+ strides (list[int] | list[tuple[int]]): Strides of anchors
+ in multiple feature levels.
+ ratios (list[float]): The list of ratios between the height and width
+ of anchors in a single level.
+ scales (list[int] | None): Anchor scales for anchors in a single level.
+            It cannot be set at the same time as `octave_base_scale` and
+            `scales_per_octave`.
+ base_sizes (list[int]): The basic sizes of anchors in multiple levels.
+ If None is given, strides will be used to generate base_sizes.
+ scale_major (bool): Whether to multiply scales first when generating
+ base anchors. If true, the anchors in the same row will have the
+ same scales. By default it is True in V2.0
+ octave_base_scale (int): The base scale of octave.
+ scales_per_octave (int): Number of scales for each octave.
+ `octave_base_scale` and `scales_per_octave` are usually used in
+ retinanet and the `scales` should be None when they are set.
+ centers (list[tuple[float, float]] | None): The centers of the anchor
+ relative to the feature grid center in multiple feature levels.
+            By default it is set to be None and not used. If a list of float
+            is given, this list will be used to shift the centers of anchors.
+        center_offset (float): The offset of center in proportion to anchors'
+            width and height. By default it is 0 in V2.0, but it should be
+            0.5 for V1.x models.
+
+ Examples:
+ >>> from mmcv.core import LegacyAnchorGenerator
+ >>> self = LegacyAnchorGenerator(
+ >>> [16], [1.], [1.], [9], center_offset=0.5)
+ >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu')
+ >>> print(all_anchors)
+ [tensor([[ 0., 0., 8., 8.],
+ [16., 0., 24., 8.],
+ [ 0., 16., 8., 24.],
+ [16., 16., 24., 24.]])]
+ """
+
+ def gen_single_level_base_anchors(self,
+ base_size,
+ scales,
+ ratios,
+ center=None):
+ """Generate base anchors of a single level.
+
+ Note:
+            The width/height of anchors are reduced by 1 when calculating \
+ the centers and corners to meet the V1.x coordinate system.
+
+ Args:
+ base_size (int | float): Basic size of an anchor.
+ scales (torch.Tensor): Scales of the anchor.
+            ratios (torch.Tensor): The ratio between the height and width
+                of anchors in a single level.
+ center (tuple[float], optional): The center of the base anchor
+ related to a single feature grid. Defaults to None.
+
+ Returns:
+ torch.Tensor: Anchors in a single-level feature map.
+ """
+ w = base_size
+ h = base_size
+ if center is None:
+ x_center = self.center_offset * (w - 1)
+ y_center = self.center_offset * (h - 1)
+ else:
+ x_center, y_center = center
+
+ h_ratios = torch.sqrt(ratios)
+ w_ratios = 1 / h_ratios
+ if self.scale_major:
+ ws = (w * w_ratios[:, None] * scales[None, :]).view(-1)
+ hs = (h * h_ratios[:, None] * scales[None, :]).view(-1)
+ else:
+ ws = (w * scales[:, None] * w_ratios[None, :]).view(-1)
+ hs = (h * scales[:, None] * h_ratios[None, :]).view(-1)
+
+ # use float anchor and the anchor's center is aligned with the
+ # pixel center
+ base_anchors = [
+ x_center - 0.5 * (ws - 1), y_center - 0.5 * (hs - 1),
+ x_center + 0.5 * (ws - 1), y_center + 0.5 * (hs - 1)
+ ]
+ base_anchors = torch.stack(base_anchors, dim=-1).round()
+
+ return base_anchors
+
+
+@PRIOR_GENERATORS.register_module()
+class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator):
+ """Legacy anchor generator used in MMDetection V1.x.
+
+ The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator`
+ can be found in `LegacyAnchorGenerator`.
+ """
+
+ def __init__(self,
+ strides,
+ ratios,
+ basesize_ratio_range,
+ input_size=300,
+ scale_major=True):
+ super(LegacySSDAnchorGenerator,
+ self).__init__(strides, ratios, basesize_ratio_range, input_size,
+ scale_major)
+ self.centers = [((stride - 1) / 2., (stride - 1) / 2.)
+ for stride in strides]
+ self.base_anchors = self.gen_base_anchors()
+
+
+@PRIOR_GENERATORS.register_module()
+class YOLOAnchorGenerator(AnchorGenerator):
+ """Anchor generator for YOLO.
+
+ Args:
+ strides (list[int] | list[tuple[int, int]]): Strides of anchors
+ in multiple feature levels.
+ base_sizes (list[list[tuple[int, int]]]): The basic sizes
+ of anchors in multiple levels.
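+
+    Examples:
+        An illustrative sketch (the stride and base sizes below follow common
+        YOLOv3 priors and are only an example, not a config from this repo):
+
+        >>> gen = YOLOAnchorGenerator(
+        ...     strides=[32], base_sizes=[[(116, 90), (156, 198), (373, 326)]])
+        >>> gen.num_base_anchors
+        [3]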
+ """
+
+ def __init__(self, strides, base_sizes):
+ self.strides = [_pair(stride) for stride in strides]
+ self.centers = [(stride[0] / 2., stride[1] / 2.)
+ for stride in self.strides]
+ self.base_sizes = []
+ num_anchor_per_level = len(base_sizes[0])
+ for base_sizes_per_level in base_sizes:
+ assert num_anchor_per_level == len(base_sizes_per_level)
+ self.base_sizes.append(
+ [_pair(base_size) for base_size in base_sizes_per_level])
+ self.base_anchors = self.gen_base_anchors()
+
+ @property
+ def num_levels(self):
+ """int: number of feature levels that the generator will be applied"""
+ return len(self.base_sizes)
+
+ def gen_base_anchors(self):
+ """Generate base anchors.
+
+ Returns:
+ list(torch.Tensor): Base anchors of a feature grid in multiple \
+ feature levels.
+ """
+ multi_level_base_anchors = []
+ for i, base_sizes_per_level in enumerate(self.base_sizes):
+ center = None
+ if self.centers is not None:
+ center = self.centers[i]
+ multi_level_base_anchors.append(
+ self.gen_single_level_base_anchors(base_sizes_per_level,
+ center))
+ return multi_level_base_anchors
+
+ def gen_single_level_base_anchors(self, base_sizes_per_level, center=None):
+ """Generate base anchors of a single level.
+
+ Args:
+ base_sizes_per_level (list[tuple[int, int]]): Basic sizes of
+ anchors.
+ center (tuple[float], optional): The center of the base anchor
+ related to a single feature grid. Defaults to None.
+
+ Returns:
+ torch.Tensor: Anchors in a single-level feature maps.
+ """
+ x_center, y_center = center
+ base_anchors = []
+ for base_size in base_sizes_per_level:
+ w, h = base_size
+
+ # use float anchor and the anchor's center is aligned with the
+ # pixel center
+ base_anchor = torch.Tensor([
+ x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w,
+ y_center + 0.5 * h
+ ])
+ base_anchors.append(base_anchor)
+ base_anchors = torch.stack(base_anchors, dim=0)
+
+ return base_anchors
+
+ def responsible_flags(self, featmap_sizes, gt_bboxes, device='cuda'):
+ """Generate responsible anchor flags of grid cells in multiple scales.
+
+ Args:
+ featmap_sizes (list(tuple)): List of feature map sizes in multiple
+ feature levels.
+ gt_bboxes (Tensor): Ground truth boxes, shape (n, 4).
+ device (str): Device where the anchors will be put on.
+
+ Return:
+            list(torch.Tensor): Responsible flags of anchors in multiple levels.
+ """
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_responsible_flags = []
+ for i in range(self.num_levels):
+ anchor_stride = self.strides[i]
+ flags = self.single_level_responsible_flags(
+ featmap_sizes[i],
+ gt_bboxes,
+ anchor_stride,
+ self.num_base_anchors[i],
+ device=device)
+ multi_level_responsible_flags.append(flags)
+ return multi_level_responsible_flags
+
+ def single_level_responsible_flags(self,
+ featmap_size,
+ gt_bboxes,
+ stride,
+ num_base_anchors,
+ device='cuda'):
+ """Generate the responsible flags of anchor in a single feature map.
+
+ Args:
+ featmap_size (tuple[int]): The size of feature maps.
+ gt_bboxes (Tensor): Ground truth boxes, shape (n, 4).
+ stride (tuple(int)): stride of current level
+ num_base_anchors (int): The number of base anchors.
+ device (str, optional): Device where the flags will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+ torch.Tensor: The valid flags of each anchor in a single level \
+ feature map.
+ """
+ feat_h, feat_w = featmap_size
+ gt_bboxes_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device)
+ gt_bboxes_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device)
+ gt_bboxes_grid_x = torch.floor(gt_bboxes_cx / stride[0]).long()
+ gt_bboxes_grid_y = torch.floor(gt_bboxes_cy / stride[1]).long()
+
+ # row major indexing
+ gt_bboxes_grid_idx = gt_bboxes_grid_y * feat_w + gt_bboxes_grid_x
+
+ responsible_grid = torch.zeros(
+ feat_h * feat_w, dtype=torch.uint8, device=device)
+ responsible_grid[gt_bboxes_grid_idx] = 1
+
+ responsible_grid = responsible_grid[:, None].expand(
+ responsible_grid.size(0), num_base_anchors).contiguous().view(-1)
+ return responsible_grid
diff --git a/mmcv/core/anchor/builder.py b/mmcv/core/anchor/builder.py
new file mode 100644
index 0000000..d53a624
--- /dev/null
+++ b/mmcv/core/anchor/builder.py
@@ -0,0 +1,18 @@
+import warnings
+
+from mmcv.utils import Registry, build_from_cfg
+
+PRIOR_GENERATORS = Registry('Generator for anchors and points')
+
+ANCHOR_GENERATORS = PRIOR_GENERATORS
+
+
+def build_prior_generator(cfg, default_args=None):
+ return build_from_cfg(cfg, PRIOR_GENERATORS, default_args)
+
+
+def build_anchor_generator(cfg, default_args=None):
+ warnings.warn(
+ '``build_anchor_generator`` would be deprecated soon, please use '
+ '``build_prior_generator`` ')
+ return build_prior_generator(cfg, default_args=default_args)
diff --git a/mmcv/core/anchor/point_generator.py b/mmcv/core/anchor/point_generator.py
new file mode 100644
index 0000000..7b11a85
--- /dev/null
+++ b/mmcv/core/anchor/point_generator.py
@@ -0,0 +1,241 @@
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+from .builder import PRIOR_GENERATORS
+
+
+@PRIOR_GENERATORS.register_module()
+class PointGenerator:
+
+ def _meshgrid(self, x, y, row_major=True):
+ xx = x.repeat(len(y))
+ yy = y.view(-1, 1).repeat(1, len(x)).view(-1)
+ if row_major:
+ return xx, yy
+ else:
+ return yy, xx
+
+ def grid_points(self, featmap_size, stride=16, device='cuda'):
+ feat_h, feat_w = featmap_size
+ shift_x = torch.arange(0., feat_w, device=device) * stride
+ shift_y = torch.arange(0., feat_h, device=device) * stride
+ shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+ stride = shift_x.new_full((shift_xx.shape[0], ), stride)
+ shifts = torch.stack([shift_xx, shift_yy, stride], dim=-1)
+ all_points = shifts.to(device)
+ return all_points
+
+ def valid_flags(self, featmap_size, valid_size, device='cuda'):
+ feat_h, feat_w = featmap_size
+ valid_h, valid_w = valid_size
+ assert valid_h <= feat_h and valid_w <= feat_w
+ valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+ valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+ valid_x[:valid_w] = 1
+ valid_y[:valid_h] = 1
+ valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+ valid = valid_xx & valid_yy
+ return valid
+
+
+@PRIOR_GENERATORS.register_module()
+class MlvlPointGenerator:
+ """Standard points generator for multi-level (Mlvl) feature maps in 2D
+ points-based detectors.
+
+ Args:
+ strides (list[int] | list[tuple[int, int]]): Strides of anchors
+ in multiple feature levels in order (w, h).
+ offset (float): The offset of points, the value is normalized with
+ corresponding stride. Defaults to 0.5.
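+
+    Examples:
+        An illustrative sketch (the stride and feature map size are only an
+        example):
+
+        >>> gen = MlvlPointGenerator(strides=[16])
+        >>> priors = gen.grid_priors([(2, 2)], device='cpu')
+        >>> # point centers: (8, 8), (24, 8), (8, 24), (24, 24)
+        >>> priors[0].shape
+        torch.Size([4, 2])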
+ """
+
+ def __init__(self, strides, offset=0.5):
+ self.strides = [_pair(stride) for stride in strides]
+ self.offset = offset
+
+ @property
+ def num_levels(self):
+ """int: number of feature levels that the generator will be applied"""
+ return len(self.strides)
+
+ @property
+ def num_base_priors(self):
+ """list[int]: The number of priors (points) at a point
+ on the feature grid"""
+ return [1 for _ in range(len(self.strides))]
+
+ def _meshgrid(self, x, y, row_major=True):
+ xx = x.repeat(len(y))
+ yy = y.view(-1, 1).repeat(1, len(x)).view(-1)
+ if row_major:
+ return xx, yy
+ else:
+ return yy, xx
+
+ def grid_priors(self, featmap_sizes, device='cuda', with_stride=False):
+ """Generate grid points of multiple feature levels.
+
+ Args:
+ featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels, each size arranged as (h, w).
+ device (str): The device where the anchors will be put on.
+ with_stride (bool): Whether to concatenate the stride to
+ the last dimension of points.
+
+ Return:
+ list[torch.Tensor]: Points of multiple feature levels.
+ The sizes of each tensor should be (N, 2) when with stride is
+ ``False``, where N = width * height, width and height
+ are the sizes of the corresponding feature level,
+ and the last dimension 2 represent (coord_x, coord_y),
+ otherwise the shape should be (N, 4),
+ and the last dimension 4 represent
+ (coord_x, coord_y, stride_w, stride_h).
+ """
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_priors = []
+ for i in range(self.num_levels):
+ priors = self.single_level_grid_priors(
+ featmap_sizes[i],
+ level_idx=i,
+ device=device,
+ with_stride=with_stride)
+ multi_level_priors.append(priors)
+ return multi_level_priors
+
+ def single_level_grid_priors(self,
+ featmap_size,
+ level_idx,
+ device='cuda',
+ with_stride=False):
+ """Generate grid Points of a single level.
+
+ Note:
+ This function is usually called by method ``self.grid_priors``.
+
+ Args:
+ featmap_size (tuple[int]): Size of the feature maps, arrange as
+ (h, w).
+ level_idx (int): The index of corresponding feature map level.
+ device (str, optional): The device the tensor will be put on.
+ Defaults to 'cuda'.
+ with_stride (bool): Concatenate the stride to the last dimension
+ of points.
+
+ Return:
+ Tensor: Points of single feature levels.
+ The shape of tensor should be (N, 2) when with stride is
+ ``False``, where N = width * height, width and height
+ are the sizes of the corresponding feature level,
+ and the last dimension 2 represent (coord_x, coord_y),
+ otherwise the shape should be (N, 4),
+ and the last dimension 4 represent
+ (coord_x, coord_y, stride_w, stride_h).
+ """
+ feat_h, feat_w = featmap_size
+ stride_w, stride_h = self.strides[level_idx]
+ shift_x = (torch.arange(0., feat_w, device=device) +
+ self.offset) * stride_w
+ shift_y = (torch.arange(0., feat_h, device=device) +
+ self.offset) * stride_h
+ shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+ if not with_stride:
+ shifts = torch.stack([shift_xx, shift_yy], dim=-1)
+ else:
+ stride_w = shift_xx.new_full((len(shift_xx), ), stride_w)
+ stride_h = shift_xx.new_full((len(shift_yy), ), stride_h)
+ shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h],
+ dim=-1)
+ all_points = shifts.to(device)
+ return all_points
+
+ def valid_flags(self, featmap_sizes, pad_shape, device='cuda'):
+ """Generate valid flags of points of multiple feature levels.
+
+ Args:
+ featmap_sizes (list(tuple)): List of feature map sizes in
+                multiple feature levels, each size arranged as (h, w).
+ pad_shape (tuple(int)): The padded shape of the image,
+ arrange as (h, w).
+ device (str): The device where the anchors will be put on.
+
+ Return:
+ list(torch.Tensor): Valid flags of points of multiple levels.
+ """
+ assert self.num_levels == len(featmap_sizes)
+ multi_level_flags = []
+ for i in range(self.num_levels):
+ point_stride = self.strides[i]
+ feat_h, feat_w = featmap_sizes[i]
+ h, w = pad_shape[:2]
+ valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h)
+ valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w)
+ flags = self.single_level_valid_flags((feat_h, feat_w),
+ (valid_feat_h, valid_feat_w),
+ device=device)
+ multi_level_flags.append(flags)
+ return multi_level_flags
+
+ def single_level_valid_flags(self,
+ featmap_size,
+ valid_size,
+ device='cuda'):
+ """Generate the valid flags of points of a single feature map.
+
+ Args:
+            featmap_size (tuple[int]): The size of feature maps, arranged
+                as (h, w).
+            valid_size (tuple[int]): The valid size of the feature maps,
+                arranged as (h, w).
+ device (str, optional): The device where the flags will be put on.
+ Defaults to 'cuda'.
+
+ Returns:
+            torch.Tensor: The valid flags of each point in a single level \
+ feature map.
+ """
+ feat_h, feat_w = featmap_size
+ valid_h, valid_w = valid_size
+ assert valid_h <= feat_h and valid_w <= feat_w
+ valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+ valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+ valid_x[:valid_w] = 1
+ valid_y[:valid_h] = 1
+ valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+ valid = valid_xx & valid_yy
+ return valid
+
+ def sparse_priors(self,
+ prior_idxs,
+ featmap_size,
+ level_idx,
+ dtype=torch.float32,
+ device='cuda'):
+ """Generate sparse points according to the ``prior_idxs``.
+
+ Args:
+ prior_idxs (Tensor): The index of corresponding anchors
+ in the feature map.
+            featmap_size (tuple[int]): feature map size arranged as (h, w).
+ level_idx (int): The level index of corresponding feature
+ map.
+            dtype (obj:`torch.dtype`): Data type of points. Defaults to
+                ``torch.float32``.
+            device (obj:`torch.device`): The device where the points are
+                located.
+ Returns:
+ Tensor: Anchor with shape (N, 2), N should be equal to
+ the length of ``prior_idxs``. And last dimension
+ 2 represent (coord_x, coord_y).
+ """
+ height, width = featmap_size
+ x = (prior_idxs % width + self.offset) * self.strides[level_idx][0]
+ y = ((prior_idxs // width) % height +
+ self.offset) * self.strides[level_idx][1]
+        priors = torch.stack([x, y], 1).to(dtype)
+        priors = priors.to(device)
+        return priors
diff --git a/mmcv/core/anchor/utils.py b/mmcv/core/anchor/utils.py
new file mode 100644
index 0000000..ab9b53f
--- /dev/null
+++ b/mmcv/core/anchor/utils.py
@@ -0,0 +1,71 @@
+import torch
+
+
+def images_to_levels(target, num_levels):
+ """Convert targets by image to targets by feature level.
+
+ [target_img0, target_img1] -> [target_level0, target_level1, ...]
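+
+    Example (illustrative; the shapes below are only an example):
+        >>> per_image = [torch.zeros(6), torch.ones(6)]
+        >>> levels = images_to_levels(per_image, [4, 2])
+        >>> [lvl.shape for lvl in levels]
+        [torch.Size([2, 4]), torch.Size([2, 2])]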
+ """
+ target = torch.stack(target, 0)
+ level_targets = []
+ start = 0
+ for n in num_levels:
+ end = start + n
+ # level_targets.append(target[:, start:end].squeeze(0))
+ level_targets.append(target[:, start:end])
+ start = end
+ return level_targets
+
+
+def anchor_inside_flags(flat_anchors,
+ valid_flags,
+ img_shape,
+ allowed_border=0):
+ """Check whether the anchors are inside the border.
+
+ Args:
+ flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4).
+ valid_flags (torch.Tensor): An existing valid flags of anchors.
+ img_shape (tuple(int)): Shape of current image.
+ allowed_border (int, optional): The border to allow the valid anchor.
+ Defaults to 0.
+
+ Returns:
+ torch.Tensor: Flags indicating whether the anchors are inside a \
+ valid range.
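+
+    Example (illustrative; the boxes below are only an example):
+        >>> flat_anchors = torch.Tensor([[0., 0., 10., 10.],
+        ...                              [-5., 0., 10., 10.]])
+        >>> valid_flags = torch.BoolTensor([True, True])
+        >>> anchor_inside_flags(flat_anchors, valid_flags, (20, 20)).tolist()
+        [True, False]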
+ """
+ img_h, img_w = img_shape[:2]
+ if allowed_border >= 0:
+ inside_flags = valid_flags & \
+ (flat_anchors[:, 0] >= -allowed_border) & \
+ (flat_anchors[:, 1] >= -allowed_border) & \
+ (flat_anchors[:, 2] < img_w + allowed_border) & \
+ (flat_anchors[:, 3] < img_h + allowed_border)
+ else:
+ inside_flags = valid_flags
+ return inside_flags
+
+
+def calc_region(bbox, ratio, featmap_size=None):
+ """Calculate a proportional bbox region.
+
+    The bbox center is fixed; each corner is interpolated toward the opposite
+    corner by ``ratio``, so the new w' and h' are (1 - 2 * ratio) * w and
+    (1 - 2 * ratio) * h.
+
+ Args:
+ bbox (Tensor): Bboxes to calculate regions, shape (n, 4).
+ ratio (float): Ratio of the output region.
+ featmap_size (tuple): Feature map size used for clipping the boundary.
+
+ Returns:
+ tuple: x1, y1, x2, y2
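+
+    Example (illustrative; with ratio=0.2 the region keeps the central 60%):
+        >>> calc_region(torch.Tensor([0., 0., 100., 100.]), 0.2)
+        (tensor(20), tensor(20), tensor(80), tensor(80))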
+ """
+ x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long()
+ y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long()
+ x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long()
+ y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long()
+ if featmap_size is not None:
+ x1 = x1.clamp(min=0, max=featmap_size[1])
+ y1 = y1.clamp(min=0, max=featmap_size[0])
+ x2 = x2.clamp(min=0, max=featmap_size[1])
+ y2 = y2.clamp(min=0, max=featmap_size[0])
+ return (x1, y1, x2, y2)
diff --git a/mmcv/core/bbox/__init__.py b/mmcv/core/bbox/__init__.py
new file mode 100644
index 0000000..3399260
--- /dev/null
+++ b/mmcv/core/bbox/__init__.py
@@ -0,0 +1,13 @@
+from .builder import build_assigner, build_bbox_coder, build_sampler
+from .samplers import (PseudoSampler)
+from .structures import (get_box_type, limit_period,
+ mono_cam_box2vis, points_cam2img, xywhr2xyxyr)
+from .transforms import (bbox2distance, bbox2result, bbox2roi,
+ bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping,
+ bbox_mapping_back, bbox_rescale, bbox_xyxy_to_cxcywh,
+ distance2bbox, roi2bbox,
+ bbox3d2result, bbox3d2roi, bbox3d_mapping_back)
+from .iou_calculators import (BboxOverlaps2D, bbox_overlaps, AxisAlignedBboxOverlaps3D,
+ BboxOverlaps3D, BboxOverlapsNearest3D,
+ axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
+ bbox_overlaps_nearest_3d)
\ No newline at end of file
diff --git a/mmcv/core/bbox/assigners/__init__.py b/mmcv/core/bbox/assigners/__init__.py
new file mode 100644
index 0000000..9c6d438
--- /dev/null
+++ b/mmcv/core/bbox/assigners/__init__.py
@@ -0,0 +1,10 @@
+from .hungarian_assigner import HungarianAssigner
+from .hungarian_assigner_3d import HungarianAssigner3D
+from .hungarian_assigner_3d_track import HungarianAssigner3DTrack
+from .base_assigner import BaseAssigner
+from .map_hungarian_assigner_3d import MapHungarianAssigner3D
+
+# __all__ = [
+# 'HungarianAssigner',
+
+# ]
diff --git a/mmcv/core/bbox/assigners/assign_result.py b/mmcv/core/bbox/assigners/assign_result.py
new file mode 100644
index 0000000..f3b9543
--- /dev/null
+++ b/mmcv/core/bbox/assigners/assign_result.py
@@ -0,0 +1,204 @@
+import torch
+
+from mmcv.utils import util_mixins
+
+
+class AssignResult(util_mixins.NiceRepr):
+ """Stores assignments between predicted and truth boxes.
+
+ Attributes:
+ num_gts (int): the number of truth boxes considered when computing this
+ assignment
+
+ gt_inds (LongTensor): for each predicted box indicates the 1-based
+ index of the assigned truth box. 0 means unassigned and -1 means
+ ignore.
+
+ max_overlaps (FloatTensor): the iou between the predicted box and its
+ assigned truth box.
+
+ labels (None | LongTensor): If specified, for each predicted box
+ indicates the category label of the assigned truth box.
+
+ Example:
+ >>> # An assign result between 4 predicted boxes and 9 true boxes
+ >>> # where only two boxes were assigned.
+ >>> num_gts = 9
+ >>> max_overlaps = torch.FloatTensor([0, .5, .9, 0])
+ >>> gt_inds = torch.LongTensor([-1, 1, 2, 0])
+ >>> labels = torch.LongTensor([0, 3, 4, 0])
+ >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels)
+ >>> print(str(self)) # xdoctest: +IGNORE_WANT
+
+ >>> # Force addition of gt labels (when adding gt as proposals)
+ >>> new_labels = torch.LongTensor([3, 4, 5])
+ >>> self.add_gt_(new_labels)
+ >>> print(str(self)) # xdoctest: +IGNORE_WANT
+
+ """
+
+ def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
+ self.num_gts = num_gts
+ self.gt_inds = gt_inds
+ self.max_overlaps = max_overlaps
+ self.labels = labels
+ # Interface for possible user-defined properties
+ self._extra_properties = {}
+
+ @property
+ def num_preds(self):
+ """int: the number of predictions in this assignment"""
+ return len(self.gt_inds)
+
+ def set_extra_property(self, key, value):
+ """Set user-defined new property."""
+ assert key not in self.info
+ self._extra_properties[key] = value
+
+ def get_extra_property(self, key):
+ """Get user-defined property."""
+ return self._extra_properties.get(key, None)
+
+ @property
+ def info(self):
+ """dict: a dictionary of info about the object"""
+ basic_info = {
+ 'num_gts': self.num_gts,
+ 'num_preds': self.num_preds,
+ 'gt_inds': self.gt_inds,
+ 'max_overlaps': self.max_overlaps,
+ 'labels': self.labels,
+ }
+ basic_info.update(self._extra_properties)
+ return basic_info
+
+ def __nice__(self):
+ """str: a "nice" summary string describing this assign result"""
+ parts = []
+ parts.append(f'num_gts={self.num_gts!r}')
+ if self.gt_inds is None:
+ parts.append(f'gt_inds={self.gt_inds!r}')
+ else:
+ parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}')
+ if self.max_overlaps is None:
+ parts.append(f'max_overlaps={self.max_overlaps!r}')
+ else:
+ parts.append('max_overlaps.shape='
+ f'{tuple(self.max_overlaps.shape)!r}')
+ if self.labels is None:
+ parts.append(f'labels={self.labels!r}')
+ else:
+ parts.append(f'labels.shape={tuple(self.labels.shape)!r}')
+ return ', '.join(parts)
+
+ @classmethod
+ def random(cls, **kwargs):
+ """Create random AssignResult for tests or debugging.
+
+ Args:
+ num_preds: number of predicted boxes
+ num_gts: number of true boxes
+ p_ignore (float): probability of a predicted box assigned to an
+ ignored truth
+ p_assigned (float): probability of a predicted box not being
+ assigned
+ p_use_label (float | bool): with labels or not
+ rng (None | int | numpy.random.RandomState): seed or state
+
+ Returns:
+ :obj:`AssignResult`: Randomly generated assign results.
+
+ Example:
+ >>> from mmcv.core.bbox.assigners.assign_result import * # NOQA
+ >>> self = AssignResult.random()
+ >>> print(self.info)
+ """
+ from mmcv.core.bbox import demodata
+ rng = demodata.ensure_rng(kwargs.get('rng', None))
+
+ num_gts = kwargs.get('num_gts', None)
+ num_preds = kwargs.get('num_preds', None)
+ p_ignore = kwargs.get('p_ignore', 0.3)
+ p_assigned = kwargs.get('p_assigned', 0.7)
+ p_use_label = kwargs.get('p_use_label', 0.5)
+ num_classes = kwargs.get('num_classes', 3)
+
+ if num_gts is None:
+ num_gts = rng.randint(0, 8)
+ if num_preds is None:
+ num_preds = rng.randint(0, 16)
+
+ if num_gts == 0:
+ max_overlaps = torch.zeros(num_preds, dtype=torch.float32)
+ gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+ if p_use_label is True or rng.rand() < p_use_label:
+ labels = torch.zeros(num_preds, dtype=torch.int64)
+ else:
+ labels = None
+ else:
+ import numpy as np
+ # Create an overlap for each predicted box
+ max_overlaps = torch.from_numpy(rng.rand(num_preds))
+
+ # Construct gt_inds for each predicted box
+ is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned)
+ # maximum number of assignments constraints
+ n_assigned = min(num_preds, min(num_gts, is_assigned.sum()))
+
+ assigned_idxs = np.where(is_assigned)[0]
+ rng.shuffle(assigned_idxs)
+ assigned_idxs = assigned_idxs[0:n_assigned]
+ assigned_idxs.sort()
+
+ is_assigned[:] = 0
+ is_assigned[assigned_idxs] = True
+
+ is_ignore = torch.from_numpy(
+ rng.rand(num_preds) < p_ignore) & is_assigned
+
+ gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+
+ true_idxs = np.arange(num_gts)
+ rng.shuffle(true_idxs)
+ true_idxs = torch.from_numpy(true_idxs)
+ gt_inds[is_assigned] = true_idxs[:n_assigned]
+
+ gt_inds = torch.from_numpy(
+ rng.randint(1, num_gts + 1, size=num_preds))
+ gt_inds[is_ignore] = -1
+ gt_inds[~is_assigned] = 0
+ max_overlaps[~is_assigned] = 0
+
+ if p_use_label is True or rng.rand() < p_use_label:
+ if num_classes == 0:
+ labels = torch.zeros(num_preds, dtype=torch.int64)
+ else:
+ labels = torch.from_numpy(
+ # remind that we set FG labels to [0, num_class-1]
+ # since mmcv v2.0
+ # BG cat_id: num_class
+ rng.randint(0, num_classes, size=num_preds))
+ labels[~is_assigned] = 0
+ else:
+ labels = None
+
+ self = cls(num_gts, gt_inds, max_overlaps, labels)
+ return self
+
+ def add_gt_(self, gt_labels):
+ """Add ground truth as assigned results.
+
+ Args:
+ gt_labels (torch.Tensor): Labels of gt boxes
+ """
+ self_inds = torch.arange(
+ 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device)
+ self.gt_inds = torch.cat([self_inds, self.gt_inds])
+
+ self.max_overlaps = torch.cat(
+ [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps])
+
+ if self.labels is not None:
+ self.labels = torch.cat([gt_labels, self.labels])
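+
+
+# NOTE: illustrative sketch, not part of the upstream file. It builds a tiny
+# AssignResult by hand and then appends two ground-truth boxes as extra
+# "predictions", the way detection heads do when adding gts as proposals.
+def _assign_result_sketch():
+    """Illustrative only: gt_inds uses 0 for background and -1 for ignore."""
+    gt_inds = torch.tensor([0, 2, -1, 1])            # 4 predictions, 2 gts
+    max_overlaps = torch.tensor([0.0, 0.9, 0.3, 0.7])
+    labels = torch.tensor([0, 5, 0, 3])
+    result = AssignResult(2, gt_inds, max_overlaps, labels)
+    result.add_gt_(torch.tensor([3, 5]))             # prepends gt_inds 1 and 2
+    return result.info                               # num_preds is now 6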
diff --git a/mmcv/core/bbox/assigners/base_assigner.py b/mmcv/core/bbox/assigners/base_assigner.py
new file mode 100644
index 0000000..1ff0160
--- /dev/null
+++ b/mmcv/core/bbox/assigners/base_assigner.py
@@ -0,0 +1,9 @@
+from abc import ABCMeta, abstractmethod
+
+
+class BaseAssigner(metaclass=ABCMeta):
+ """Base assigner that assigns boxes to ground truth boxes."""
+
+ @abstractmethod
+ def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
+ """Assign boxes to either a ground truth boxes or a negative boxes."""
diff --git a/mmcv/core/bbox/assigners/hungarian_assigner.py b/mmcv/core/bbox/assigners/hungarian_assigner.py
new file mode 100644
index 0000000..e10cc14
--- /dev/null
+++ b/mmcv/core/bbox/assigners/hungarian_assigner.py
@@ -0,0 +1,145 @@
+import torch
+
+from ..builder import BBOX_ASSIGNERS
+from ..match_costs import build_match_cost
+from ..transforms import bbox_cxcywh_to_xyxy
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+
+try:
+ from scipy.optimize import linear_sum_assignment
+except ImportError:
+ linear_sum_assignment = None
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner(BaseAssigner):
+ """Computes one-to-one matching between predictions and ground truth.
+
+ This class computes an assignment between the targets and the predictions
+ based on the costs. The costs are weighted sum of three components:
+ classification cost, regression L1 cost and regression iou cost. The
+ targets don't include the no_object, so generally there are more
+ predictions than targets. After the one-to-one matching, the un-matched
+ are treated as backgrounds. Thus each query prediction will be assigned
+ with `0` or a positive integer indicating the ground truth index:
+
+ - 0: negative sample, no assigned gt
+ - positive integer: positive sample, index (1-based) of assigned gt
+
+ Args:
+ cls_cost (dict, optional): Config of the classification match cost.
+ Defaults to ``dict(type='ClassificationCost', weight=1.)``.
+ reg_cost (dict, optional): Config of the regression L1 match cost.
+ Defaults to ``dict(type='BBoxL1Cost', weight=1.0)``.
+ iou_cost (dict, optional): Config of the regression IoU match cost
+ (GIoU by default, as in DETR). Defaults to
+ ``dict(type='IoUCost', iou_mode='giou', weight=1.0)``.
+ """
+
+ def __init__(self,
+ cls_cost=dict(type='ClassificationCost', weight=1.),
+ reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)):
+ self.cls_cost = build_match_cost(cls_cost)
+ self.reg_cost = build_match_cost(reg_cost)
+ self.iou_cost = build_match_cost(iou_cost)
+
+ def assign(self,
+ bbox_pred,
+ cls_pred,
+ gt_bboxes,
+ gt_labels,
+ img_meta,
+ gt_bboxes_ignore=None,
+ eps=1e-7):
+ """Computes one-to-one matching based on the weighted costs.
+
+ This method assign each query prediction to a ground truth or
+ background. The `assigned_gt_inds` with -1 means don't care,
+ 0 means negative sample, and positive number is the index (1-based)
+ of assigned gt.
+ The assignment is done in the following steps, the order matters.
+
+ 1. assign every prediction to -1
+ 2. compute the weighted costs
+ 3. do Hungarian matching on CPU based on the costs
+ 4. assign all to 0 (background) first, then for each matched pair
+ between predictions and gts, treat this prediction as foreground
+ and assign the corresponding gt index (plus 1) to it.
+
+ Args:
+ bbox_pred (Tensor): Predicted boxes with normalized coordinates
+ (cx, cy, w, h), which are all in range [0, 1]. Shape
+ [num_query, 4].
+ cls_pred (Tensor): Predicted classification logits, shape
+ [num_query, num_class].
+ gt_bboxes (Tensor): Ground truth boxes with unnormalized
+ coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+ img_meta (dict): Meta information for current image.
+ gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+ labelled as `ignored`. Default None.
+ eps (int | float, optional): A value added to the denominator for
+ numerical stability. Default 1e-7.
+
+ Returns:
+ :obj:`AssignResult`: The assigned result.
+ """
+ assert gt_bboxes_ignore is None, \
+ 'Only case when gt_bboxes_ignore is None is supported.'
+ num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+ # 1. assign -1 by default
+ assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ assigned_labels = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ if num_gts == 0 or num_bboxes == 0:
+ # No ground truth or boxes, return empty assignment
+ if num_gts == 0:
+ # No ground truth, assign all to background
+ assigned_gt_inds[:] = 0
+ return AssignResult(
+ num_gts, assigned_gt_inds, None, labels=assigned_labels)
+ img_h, img_w, _ = img_meta['img_shape']
+ factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
+ img_h]).unsqueeze(0)
+
+ # 2. compute the weighted costs
+ # classification and bboxcost.
+ cls_cost = self.cls_cost(cls_pred, gt_labels)
+ # regression L1 cost
+ normalize_gt_bboxes = gt_bboxes / factor
+ reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes)
+ # regression iou cost, defaultly giou is used in official DETR.
+ bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
+ iou_cost = self.iou_cost(bboxes, gt_bboxes)
+ # weighted sum of above three costs
+ cost = cls_cost + reg_cost + iou_cost
+
+ # 3. do Hungarian matching on CPU using linear_sum_assignment
+ cost = cost.detach().cpu()
+ if linear_sum_assignment is None:
+ raise ImportError('Please run "pip install scipy" '
+ 'to install scipy first.')
+ matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+ matched_row_inds = torch.from_numpy(matched_row_inds).to(
+ bbox_pred.device)
+ matched_col_inds = torch.from_numpy(matched_col_inds).to(
+ bbox_pred.device)
+
+ # 4. assign backgrounds and foregrounds
+ # assign all indices to backgrounds first
+ assigned_gt_inds[:] = 0
+ # assign foregrounds based on matching results
+ assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+ assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+ return AssignResult(
+ num_gts, assigned_gt_inds, None, labels=assigned_labels)
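+
+
+# NOTE: illustrative sketch, not part of the upstream file. The core of step 3
+# in ``assign`` is a plain call to scipy's linear_sum_assignment on the
+# (num_query, num_gt) cost matrix.
+def _hungarian_matching_sketch():
+    """Illustrative only: minimum-cost one-to-one matching on a toy cost."""
+    from scipy.optimize import linear_sum_assignment
+    cost = torch.tensor([[0.9, 0.1],                 # 3 queries x 2 gts
+                         [0.2, 0.8],
+                         [0.5, 0.5]])
+    rows, cols = linear_sum_assignment(cost.cpu().numpy())
+    # rows == [0, 1], cols == [1, 0]: query 0 -> gt 1, query 1 -> gt 0, while
+    # query 2 stays unmatched and would later be treated as background.
+    return rows, cols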
diff --git a/mmcv/core/bbox/assigners/hungarian_assigner_3d.py b/mmcv/core/bbox/assigners/hungarian_assigner_3d.py
new file mode 100755
index 0000000..86d6cf2
--- /dev/null
+++ b/mmcv/core/bbox/assigners/hungarian_assigner_3d.py
@@ -0,0 +1,136 @@
+import torch
+
+from mmcv.core.bbox.builder import BBOX_ASSIGNERS
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+from mmcv.core.bbox.match_costs import build_match_cost
+from mmcv.models.utils.transformer import inverse_sigmoid
+from mmcv.core.bbox.util import normalize_bbox
+
+try:
+ from scipy.optimize import linear_sum_assignment
+except ImportError:
+ linear_sum_assignment = None
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner3D(BaseAssigner):
+ """Computes one-to-one matching between predictions and ground truth.
+ This class computes an assignment between the targets and the predictions
+ based on the costs. The costs are weighted sum of three components:
+ classification cost, regression L1 cost and regression iou cost. The
+ targets don't include the no_object, so generally there are more
+ predictions than targets. After the one-to-one matching, the un-matched
+ are treated as backgrounds. Thus each query prediction will be assigned
+ with `0` or a positive integer indicating the ground truth index:
+ - 0: negative sample, no assigned gt
+ - positive integer: positive sample, index (1-based) of assigned gt
+ Args:
+ cls_cost (dict, optional): Config of the classification match cost.
+ Defaults to ``dict(type='ClassificationCost', weight=1.)``.
+ reg_cost (dict, optional): Config of the regression L1 match cost.
+ Defaults to ``dict(type='BBoxL1Cost', weight=1.0)``.
+ iou_cost (dict, optional): Config of the IoU match cost. Kept for
+ config compatibility; it is not used in :meth:`assign`.
+ pc_range (list[float], optional): Point cloud range used to
+ normalize the ground truth boxes. Defaults to None.
+ """
+
+ def __init__(self,
+ cls_cost=dict(type='ClassificationCost', weight=1.),
+ reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pc_range=None):
+ self.cls_cost = build_match_cost(cls_cost)
+ self.reg_cost = build_match_cost(reg_cost)
+ self.iou_cost = build_match_cost(iou_cost)
+ self.pc_range = pc_range
+
+ def assign(self,
+ bbox_pred,
+ cls_pred,
+ gt_bboxes,
+ gt_labels,
+ gt_bboxes_ignore=None,
+ eps=1e-7):
+ """Computes one-to-one matching based on the weighted costs.
+ This method assign each query prediction to a ground truth or
+ background. The `assigned_gt_inds` with -1 means don't care,
+ 0 means negative sample, and positive number is the index (1-based)
+ of assigned gt.
+ The assignment is done in the following steps, the order matters.
+ 1. assign every prediction to -1
+ 2. compute the weighted costs
+ 3. do Hungarian matching on CPU based on the costs
+ 4. assign all to 0 (background) first, then for each matched pair
+ between predictions and gts, treat this prediction as foreground
+ and assign the corresponding gt index (plus 1) to it.
+ Args:
+ bbox_pred (Tensor): Predicted boxes with normalized coordinates
+ (cx, cy, w, h), which are all in range [0, 1]. Shape
+ [num_query, 4].
+ cls_pred (Tensor): Predicted classification logits, shape
+ [num_query, num_class].
+ gt_bboxes (Tensor): Ground truth boxes with unnormalized
+ coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+ gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+ labelled as `ignored`. Default None.
+ eps (int | float, optional): A value added to the denominator for
+ numerical stability. Default 1e-7.
+ Returns:
+ :obj:`AssignResult`: The assigned result.
+ """
+ assert gt_bboxes_ignore is None, \
+ 'Only case when gt_bboxes_ignore is None is supported.'
+ num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+ # 1. assign -1 by default
+ assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ assigned_labels = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ if num_gts == 0 or num_bboxes == 0:
+ # No ground truth or boxes, return empty assignment
+ if num_gts == 0:
+ # No ground truth, assign all to background
+ assigned_gt_inds[:] = 0
+ return AssignResult(
+ num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+ # 2. compute the weighted costs
+ # classification and bboxcost.
+ cls_cost = self.cls_cost(cls_pred, gt_labels)
+ # regression L1 cost
+
+ normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
+
+ reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
+
+ # weighted sum of above two costs
+ cost = cls_cost + reg_cost
+
+ # 3. do Hungarian matching on CPU using linear_sum_assignment
+ cost = cost.detach().cpu()
+ if linear_sum_assignment is None:
+ raise ImportError('Please run "pip install scipy" '
+ 'to install scipy first.')
+ matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+ matched_row_inds = torch.from_numpy(matched_row_inds).to(
+ bbox_pred.device)
+ matched_col_inds = torch.from_numpy(matched_col_inds).to(
+ bbox_pred.device)
+
+ # 4. assign backgrounds and foregrounds
+ # assign all indices to backgrounds first
+ assigned_gt_inds[:] = 0
+ # assign foregrounds based on matching results
+ assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+ assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+ return AssignResult(
+ num_gts, assigned_gt_inds, None, labels=assigned_labels)
\ No newline at end of file
diff --git a/mmcv/core/bbox/assigners/hungarian_assigner_3d_track.py b/mmcv/core/bbox/assigners/hungarian_assigner_3d_track.py
new file mode 100644
index 0000000..792d0f9
--- /dev/null
+++ b/mmcv/core/bbox/assigners/hungarian_assigner_3d_track.py
@@ -0,0 +1,122 @@
+import numpy as np
+import torch
+
+from mmcv.core.bbox.builder import BBOX_ASSIGNERS
+from mmcv.core.bbox.assigners.base_assigner import BaseAssigner
+from mmcv.core.bbox.match_costs import build_match_cost
+try:
+ from scipy.optimize import linear_sum_assignment
+except ImportError:
+ linear_sum_assignment = None
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner3DTrack(BaseAssigner):
+ """Computes one-to-one matching between predictions and ground truth.
+ This class computes an assignment between the targets and the predictions
+ based on the costs. The costs are weighted sum of three components:
+ classification cost, regression L1 cost and regression iou cost. The
+ targets don't include the no_object, so generally there are more
+ predictions than targets. After the one-to-one matching, the un-matched
+ are treated as backgrounds. Thus each query prediction will be assigned
+ with `0` or a positive integer indicating the ground truth index:
+ - 0: negative sample, no assigned gt
+ - positive integer: positive sample, index (1-based) of assigned gt
+ Args:
+ cls_cost (dict, optional): Config of the classification match cost.
+ Defaults to ``dict(type='ClassificationCost', weight=1.)``.
+ reg_cost (dict, optional): Config of the regression L1 match cost.
+ Defaults to ``dict(type='BBoxL1Cost', weight=1.0)``.
+ pc_range (list[float], optional): Point cloud range. Defaults to None.
+ """
+
+ def __init__(self,
+ cls_cost=dict(type='ClassificationCost', weight=1.),
+ reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+ pc_range=None):
+ self.cls_cost = build_match_cost(cls_cost)
+ self.reg_cost = build_match_cost(reg_cost)
+ self.pc_range = pc_range
+
+ def assign(self,
+ bbox_pred,
+ cls_pred,
+ gt_bboxes,
+ gt_labels,
+ gt_bboxes_ignore=None,
+ eps=1e-7):
+ """Computes one-to-one matching based on the weighted costs.
+ This method assign each query prediction to a ground truth or
+ background. The `assigned_gt_inds` with -1 means don't care,
+ 0 means negative sample, and positive number is the index (1-based)
+ of assigned gt.
+ The assignment is done in the following steps, the order matters.
+ 1. assign every prediction to -1
+ 2. compute the weighted costs
+ 3. do Hungarian matching on CPU based on the costs
+ 4. assign all to 0 (background) first, then for each matched pair
+ between predictions and gts, treat this prediction as foreground
+ and assign the corresponding gt index (plus 1) to it.
+ Args:
+ bbox_pred (Tensor): Predicted boxes with normalized coordinates
+ (cx, cy, w, h), which are all in range [0, 1]. Shape
+ [num_query, 4].
+ cls_pred (Tensor): Predicted classification logits, shape
+ [num_query, num_class].
+ gt_bboxes (Tensor): Ground truth boxes with unnormalized
+ coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+ gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+ labelled as `ignored`. Default None.
+ eps (int | float, optional): A value added to the denominator for
+ numerical stability. Default 1e-7.
+ Returns:
+ tuple[Tensor]: Matched row (prediction) indices and column (gt)
+ indices, or ``(None, None)`` if there are no gts or predictions.
+ """
+ assert gt_bboxes_ignore is None, \
+ 'Only case when gt_bboxes_ignore is None is supported.'
+ num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+ # 1. assign -1 by default
+ assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ assigned_labels = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ if num_gts == 0 or num_bboxes == 0:
+ # No ground truth or boxes, return empty assignment
+ if num_gts == 0:
+ # No ground truth, assign all to background
+ assigned_gt_inds[:] = 0
+ return (None, None)
+ # 2. compute the weighted costs
+ # classification and bboxcost.
+ cls_cost = self.cls_cost(cls_pred, gt_labels)
+ # regression L1 cost
+ reg_cost = self.reg_cost(bbox_pred[:, :8], gt_bboxes[:, :8])
+ # weighted sum of above three costs
+ cost = cls_cost + reg_cost
+
+ cost = torch.nan_to_num(cost)
+
+ # 3. do Hungarian matching on CPU using linear_sum_assignment
+ cost = cost.detach().cpu()
+ if linear_sum_assignment is None:
+ raise ImportError('Please run "pip install scipy" '
+ 'to install scipy first.')
+ cost = np.nan_to_num(cost)
+ matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+ matched_row_inds = torch.from_numpy(matched_row_inds).to(
+ bbox_pred.device)
+ matched_col_inds = torch.from_numpy(matched_col_inds).to(
+ bbox_pred.device)
+
+ # 4. assign backgrounds and foregrounds
+ # assign all indices to backgrounds first
+ assigned_gt_inds[:] = 0
+ # assign foregrounds based on matching results
+ assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+ assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+
+ return (matched_row_inds, matched_col_inds)
+
diff --git a/mmcv/core/bbox/assigners/map_hungarian_assigner_3d.py b/mmcv/core/bbox/assigners/map_hungarian_assigner_3d.py
new file mode 100644
index 0000000..2bfc278
--- /dev/null
+++ b/mmcv/core/bbox/assigners/map_hungarian_assigner_3d.py
@@ -0,0 +1,162 @@
+import torch
+import torch.nn.functional as F
+
+from mmcv.core.bbox.builder import BBOX_ASSIGNERS
+from mmcv.core.bbox.assigners.assign_result import AssignResult
+from mmcv.core.bbox.assigners.base_assigner import BaseAssigner
+from mmcv.core.bbox.match_costs import build_match_cost
+from mmcv.models.utils.transformer import inverse_sigmoid
+from mmcv.core.bbox.util import normalize_bbox
+from mmcv.models.vad_utils.map_utils import (
+ normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox
+)
+
+try:
+ from scipy.optimize import linear_sum_assignment
+except ImportError:
+ linear_sum_assignment = None
+
+@BBOX_ASSIGNERS.register_module()
+class MapHungarianAssigner3D(BaseAssigner):
+ """Computes one-to-one matching between predictions and ground truth.
+ This class computes an assignment between the targets and the predictions
+ based on the costs. The costs are weighted sum of three components:
+ classification cost, regression L1 cost and regression iou cost. The
+ targets don't include the no_object, so generally there are more
+ predictions than targets. After the one-to-one matching, the un-matched
+ are treated as backgrounds. Thus each query prediction will be assigned
+ with `0` or a positive integer indicating the ground truth index:
+ - 0: negative sample, no assigned gt
+ - positive integer: positive sample, index (1-based) of assigned gt
+ Args:
+ cls_cost (dict, optional): Config of the classification match cost.
+ Defaults to ``dict(type='ClassificationCost', weight=1.)``.
+ reg_cost (dict, optional): Config of the regression L1 match cost.
+ Defaults to ``dict(type='BBoxL1Cost', weight=1.0)``.
+ iou_cost (dict, optional): Config of the regression IoU match cost.
+ Defaults to ``dict(type='IoUCost', weight=0.0)``.
+ pts_cost (dict, optional): Config of the point-set match cost
+ (Chamfer distance) between predicted and ground-truth map lines.
+ pc_range (list[float], optional): Point cloud range used to normalize
+ boxes and points. Defaults to None.
+ """
+
+ def __init__(self,
+ cls_cost=dict(type='ClassificationCost', weight=1.),
+ reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+ iou_cost=dict(type='IoUCost', weight=0.0),
+ pts_cost=dict(type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0),
+ pc_range=None):
+ self.cls_cost = build_match_cost(cls_cost)
+ self.reg_cost = build_match_cost(reg_cost)
+ self.iou_cost = build_match_cost(iou_cost)
+ self.pts_cost = build_match_cost(pts_cost)
+ self.pc_range = pc_range
+
+ def assign(self,
+ bbox_pred,
+ cls_pred,
+ pts_pred,
+ gt_bboxes,
+ gt_labels,
+ gt_pts,
+ gt_bboxes_ignore=None,
+ eps=1e-7):
+ """Computes one-to-one matching based on the weighted costs.
+ This method assign each query prediction to a ground truth or
+ background. The `assigned_gt_inds` with -1 means don't care,
+ 0 means negative sample, and positive number is the index (1-based)
+ of assigned gt.
+ The assignment is done in the following steps, the order matters.
+ 1. assign every prediction to -1
+ 2. compute the weighted costs
+ 3. do Hungarian matching on CPU based on the costs
+ 4. assign all to 0 (background) first, then for each matched pair
+ between predictions and gts, treat this prediction as foreground
+ and assign the corresponding gt index (plus 1) to it.
+ Args:
+ bbox_pred (Tensor): Predicted boxes with normalized coordinates
+ (cx, cy, w, h), which are all in range [0, 1]. Shape
+ [num_query, 4].
+ cls_pred (Tensor): Predicted classification logits, shape
+ [num_query, num_class].
+ gt_bboxes (Tensor): Ground truth boxes with unnormalized
+ coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+ gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+ labelled as `ignored`. Default None.
+ eps (int | float, optional): A value added to the denominator for
+ numerical stability. Default 1e-7.
+ Returns:
+ tuple: An :obj:`AssignResult` and ``order_index``, the index of the
+ best-matching point ordering for each matched pair.
+ """
+ assert gt_bboxes_ignore is None, \
+ 'Only case when gt_bboxes_ignore is None is supported.'
+ assert bbox_pred.shape[-1] == 4, \
+ 'Only bbox_pred with 4 box parameters (last dim == 4) is supported'
+ num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+ # 1. assign -1 by default
+ assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ assigned_labels = bbox_pred.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ if num_gts == 0 or num_bboxes == 0:
+ # No ground truth or boxes, return empty assignment
+ if num_gts == 0:
+ # No ground truth, assign all to background
+ assigned_gt_inds[:] = 0
+ return AssignResult(
+ num_gts, assigned_gt_inds, None, labels=assigned_labels), None
+
+ # 2. compute the weighted costs
+ # classification and bboxcost.
+ cls_cost = self.cls_cost(cls_pred, gt_labels)
+ # regression L1 cost
+
+ normalized_gt_bboxes = normalize_2d_bbox(gt_bboxes, self.pc_range)
+ # normalized_gt_bboxes = gt_bboxes
+ # import pdb;pdb.set_trace()
+ reg_cost = self.reg_cost(bbox_pred[:, :4], normalized_gt_bboxes[:, :4])
+
+ _, num_orders, num_pts_per_gtline, num_coords = gt_pts.shape
+ normalized_gt_pts = normalize_2d_pts(gt_pts, self.pc_range)
+ num_pts_per_predline = pts_pred.size(1)
+ if num_pts_per_predline != num_pts_per_gtline:
+ pts_pred_interpolated = F.interpolate(pts_pred.permute(0,2,1),size=(num_pts_per_gtline),
+ mode='linear', align_corners=True)
+ pts_pred_interpolated = pts_pred_interpolated.permute(0,2,1).contiguous()
+ else:
+ pts_pred_interpolated = pts_pred
+ # num_q, num_pts, 2 <-> num_gt, num_pts, 2
+ pts_cost_ordered = self.pts_cost(pts_pred_interpolated, normalized_gt_pts)
+ pts_cost_ordered = pts_cost_ordered.view(num_bboxes, num_gts, num_orders)
+ pts_cost, order_index = torch.min(pts_cost_ordered, 2)
+
+ bboxes = denormalize_2d_bbox(bbox_pred, self.pc_range)
+ iou_cost = self.iou_cost(bboxes, gt_bboxes)
+ # weighted sum of above three costs
+ cost = cls_cost + reg_cost + iou_cost + pts_cost
+
+ # 3. do Hungarian matching on CPU using linear_sum_assignment
+ cost = cost.detach().cpu()
+ if linear_sum_assignment is None:
+ raise ImportError('Please run "pip install scipy" '
+ 'to install scipy first.')
+ matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+ matched_row_inds = torch.from_numpy(matched_row_inds).to(
+ bbox_pred.device)
+ matched_col_inds = torch.from_numpy(matched_col_inds).to(
+ bbox_pred.device)
+
+ # 4. assign backgrounds and foregrounds
+ # assign all indices to backgrounds first
+ assigned_gt_inds[:] = 0
+ # assign foregrounds based on matching results
+ assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+ assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+ return AssignResult(
+ num_gts, assigned_gt_inds, None, labels=assigned_labels), order_index
\ No newline at end of file
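+
+
+# NOTE: illustrative sketch, not part of the upstream file. It mirrors the
+# ``pts_pred_interpolated`` branch above: F.interpolate resamples a predicted
+# polyline along its point dimension to match the ground-truth resolution.
+def _polyline_resample_sketch():
+    """Illustrative only: resample (num_q, num_pts, 2) polylines along dim 1."""
+    pts_pred = torch.rand(5, 10, 2)                  # 5 lines, 10 points each
+    resampled = F.interpolate(pts_pred.permute(0, 2, 1), size=20,
+                              mode='linear', align_corners=True)
+    return resampled.permute(0, 2, 1).contiguous()   # shape (5, 20, 2)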
diff --git a/mmcv/core/bbox/box_np_ops.py b/mmcv/core/bbox/box_np_ops.py
new file mode 100644
index 0000000..6740e4e
--- /dev/null
+++ b/mmcv/core/bbox/box_np_ops.py
@@ -0,0 +1,896 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# TODO: clean the functions in this file and move the APIs into box structures
+# in the future
+
+import numba
+import numpy as np
+
+
+def camera_to_lidar(points, r_rect, velo2cam):
+ """Convert points in camera coordinate to lidar coordinate.
+
+ Args:
+ points (np.ndarray, shape=[N, 3]): Points in camera coordinate.
+ r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+ specific camera coordinate (e.g. CAM2) to CAM0.
+ velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
+ camera coordinate to lidar coordinate.
+
+ Returns:
+ np.ndarray, shape=[N, 3]: Points in lidar coordinate.
+ """
+ points_shape = list(points.shape[0:-1])
+ if points.shape[-1] == 3:
+ points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1)
+ lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T)
+ return lidar_points[..., :3]
+
+
+def box_camera_to_lidar(data, r_rect, velo2cam):
+ """Covert boxes in camera coordinate to lidar coordinate.
+
+ Args:
+ data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
+ r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+ specific camera coordinate (e.g. CAM2) to CAM0.
+ velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
+ camera coordinate to lidar coordinate.
+
+ Returns:
+ np.ndarray, shape=[N, 3]: Boxes in lidar coordinate.
+ """
+ xyz = data[:, 0:3]
+ l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6]
+ r = data[:, 6:7]
+ xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam)
+ return np.concatenate([xyz_lidar, w, l, h, r], axis=1)
+
+
+def corners_nd(dims, origin=0.5):
+ """Generate relative box corners based on length per dim and origin point.
+
+ Args:
+ dims (np.ndarray, shape=[N, ndim]): Array of length per dim
+ origin (list or array or float, optional): origin point relate to
+ smallest point. Defaults to 0.5
+
+ Returns:
+ np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners.
+ point layout example: (2d) x0y0, x0y1, x1y0, x1y1;
+ (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
+ where x0 < x1, y0 < y1, z0 < z1.
+ """
+ ndim = int(dims.shape[1])
+ corners_norm = np.stack(
+ np.unravel_index(np.arange(2**ndim), [2] * ndim),
+ axis=1).astype(dims.dtype)
+ # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1
+ # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
+ # so we need to convert them to a format that is convenient for further
+ # computation: for 2d boxes the corners are ordered clockwise starting
+ # from the minimum point; for 3d boxes, please draw the lines by hand.
+ if ndim == 2:
+ # generate clockwise box corners
+ corners_norm = corners_norm[[0, 1, 3, 2]]
+ elif ndim == 3:
+ corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+ corners_norm = corners_norm - np.array(origin, dtype=dims.dtype)
+ corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape(
+ [1, 2**ndim, ndim])
+ return corners
+
+
+def rotation_2d(points, angles):
+ """Rotation 2d points based on origin point clockwise when angle positive.
+
+ Args:
+ points (np.ndarray): Points to be rotated with shape \
+ (N, point_size, 2).
+ angles (np.ndarray): Rotation angle with shape (N).
+
+ Returns:
+ np.ndarray: Same shape as points.
+ """
+ rot_sin = np.sin(angles)
+ rot_cos = np.cos(angles)
+ rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]])
+ return np.einsum('aij,jka->aik', points, rot_mat_T)
+
+
+def center_to_corner_box2d(centers, dims, angles=None, origin=0.5):
+ """Convert kitti locations, dimensions and angles to corners.
+ format: center(xy), dims(xy), angles(clockwise when positive)
+
+ Args:
+ centers (np.ndarray): Locations in kitti label file with shape (N, 2).
+ dims (np.ndarray): Dimensions in kitti label file with shape (N, 2).
+ angles (np.ndarray, optional): Rotation_y in kitti label file with
+ shape (N). Defaults to None.
+ origin (list or array or float, optional): origin point relate to
+ smallest point. Defaults to 0.5.
+
+ Returns:
+ np.ndarray: Corners with the shape of (N, 4, 2).
+ """
+ # 'length' in kitti format is in x axis.
+ # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
+ # center in kitti format is [0.5, 1.0, 0.5] in xyz.
+ corners = corners_nd(dims, origin=origin)
+ # corners: [N, 4, 2]
+ if angles is not None:
+ corners = rotation_2d(corners, angles)
+ corners += centers.reshape([-1, 1, 2])
+ return corners
+
+
+@numba.jit(nopython=True)
+def depth_to_points(depth, trunc_pixel):
+ """Convert depth map to points.
+
+ Args:
+ depth (np.array, shape=[H, W]): Depth map in which the rows
+ [0, `trunc_pixel`) are truncated.
+ trunc_pixel (int): The number of truncated row.
+
+ Returns:
+ np.ndarray: Points in camera coordinates.
+ """
+ num_pts = np.sum(depth[trunc_pixel:, ] > 0.1)
+ points = np.zeros((num_pts, 3), dtype=depth.dtype)
+ x = np.array([0, 0, 1], dtype=depth.dtype)
+ k = 0
+ for i in range(trunc_pixel, depth.shape[0]):
+ for j in range(depth.shape[1]):
+ if depth[i, j] > 0.1:
+ x = np.array([j, i, 1], dtype=depth.dtype)
+ points[k] = x * depth[i, j]
+ k += 1
+ return points
+
+
+def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam):
+ """Convert depth map to points in lidar coordinate.
+
+ Args:
+ depth (np.array, shape=[H, W]): Depth map in which the rows
+ [0, `trunc_pixel`) are truncated.
+ trunc_pixel (int): The number of truncated row.
+ P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
+ r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+ specific camera coordinate (e.g. CAM2) to CAM0.
+ velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
+ camera coordinate to lidar coordinate.
+
+ Returns:
+ np.ndarray: Points in lidar coordinates.
+ """
+ pts = depth_to_points(depth, trunc_pixel)
+ points_shape = list(pts.shape[0:-1])
+ points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1)
+ points = points @ np.linalg.inv(P2.T)
+ lidar_points = camera_to_lidar(points, r_rect, velo2cam)
+ return lidar_points
+
+
+def rotation_3d_in_axis(points, angles, axis=0):
+ """Rotate points in specific axis.
+
+ Args:
+ points (np.ndarray, shape=[N, point_size, 3]): Points to be rotated.
+ angles (np.ndarray, shape=[N]): Rotation angles.
+ axis (int, optional): Axis to rotate at. Defaults to 0.
+
+ Returns:
+ np.ndarray: Rotated points.
+ """
+ # points: [N, point_size, 3]
+ rot_sin = np.sin(angles)
+ rot_cos = np.cos(angles)
+ ones = np.ones_like(rot_cos)
+ zeros = np.zeros_like(rot_cos)
+ if axis == 1:
+ rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros],
+ [rot_sin, zeros, rot_cos]])
+ elif axis == 2 or axis == -1:
+ rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros],
+ [rot_sin, rot_cos, zeros], [zeros, zeros, ones]])
+ elif axis == 0:
+ rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin],
+ [zeros, rot_sin, rot_cos], [ones, zeros, zeros]])
+ else:
+ raise ValueError('axis should be 0, 1, 2 or -1')
+
+ return np.einsum('aij,jka->aik', points, rot_mat_T)
+
+
+def center_to_corner_box3d(centers,
+ dims,
+ angles=None,
+ origin=(0.5, 1.0, 0.5),
+ axis=1):
+ """Convert kitti locations, dimensions and angles to corners.
+
+ Args:
+ centers (np.ndarray): Locations in kitti label file with shape (N, 3).
+ dims (np.ndarray): Dimensions in kitti label file with shape (N, 3).
+ angles (np.ndarray, optional): Rotation_y in kitti label file with
+ shape (N). Defaults to None.
+ origin (list or array or float, optional): Origin point relate to
+ smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0)
+ in lidar. Defaults to (0.5, 1.0, 0.5).
+ axis (int, optional): Rotation axis. 1 for camera and 2 for lidar.
+ Defaults to 1.
+
+ Returns:
+ np.ndarray: Corners with the shape of (N, 8, 3).
+ """
+ # 'length' in kitti format is in x axis.
+ # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
+ # center in kitti format is [0.5, 1.0, 0.5] in xyz.
+ corners = corners_nd(dims, origin=origin)
+ # corners: [N, 8, 3]
+ if angles is not None:
+ corners = rotation_3d_in_axis(corners, angles, axis=axis)
+ corners += centers.reshape([-1, 1, 3])
+ return corners
+
+
+@numba.jit(nopython=True)
+def box2d_to_corner_jit(boxes):
+ """Convert box2d to corner.
+
+ Args:
+ boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation.
+
+ Returns:
+ box_corners (np.ndarray, shape=[N, 4, 2]): Box corners.
+ """
+ num_box = boxes.shape[0]
+ corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
+ corners_norm[1, 1] = 1.0
+ corners_norm[2] = 1.0
+ corners_norm[3, 0] = 1.0
+ corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
+ corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(
+ 1, 4, 2)
+ rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
+ box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype)
+ for i in range(num_box):
+ rot_sin = np.sin(boxes[i, -1])
+ rot_cos = np.cos(boxes[i, -1])
+ rot_mat_T[0, 0] = rot_cos
+ rot_mat_T[0, 1] = -rot_sin
+ rot_mat_T[1, 0] = rot_sin
+ rot_mat_T[1, 1] = rot_cos
+ box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2]
+ return box_corners
+
+
+@numba.njit
+def corner_to_standup_nd_jit(boxes_corner):
+ """Convert boxes_corner to aligned (min-max) boxes.
+
+ Args:
+ boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners.
+
+ Returns:
+ np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes.
+ """
+ num_boxes = boxes_corner.shape[0]
+ ndim = boxes_corner.shape[-1]
+ result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype)
+ for i in range(num_boxes):
+ for j in range(ndim):
+ result[i, j] = np.min(boxes_corner[i, :, j])
+ for j in range(ndim):
+ result[i, j + ndim] = np.max(boxes_corner[i, :, j])
+ return result
+
+
+@numba.jit(nopython=True)
+def corner_to_surfaces_3d_jit(corners):
+ """Convert 3d box corners from corner function above to surfaces that
+ normal vectors all direct to internal.
+
+ Args:
+ corners (np.ndarray): 3d box corners with the shape of (N, 8, 3).
+
+ Returns:
+ np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
+ """
+ # box_corners: [N, 8, 3], must from corner functions in this module
+ num_boxes = corners.shape[0]
+ surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype)
+ corner_idxes = np.array([
+ 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7
+ ]).reshape(6, 4)
+ for i in range(num_boxes):
+ for j in range(6):
+ for k in range(4):
+ surfaces[i, j, k] = corners[i, corner_idxes[j, k]]
+ return surfaces
+
+
+def rotation_points_single_angle(points, angle, axis=0):
+ """Rotate points with a single angle.
+
+ Args:
+ points (np.ndarray, shape=[N, 3]): Points to be rotated.
+ angle (float | np.ndarray, shape=[1]): Rotation angle.
+ axis (int, optional): Axis to rotate at. Defaults to 0.
+
+ Returns:
+ np.ndarray: Rotated points.
+ """
+ # points: [N, 3]
+ rot_sin = np.sin(angle)
+ rot_cos = np.cos(angle)
+ if axis == 1:
+ rot_mat_T = np.array(
+ [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]],
+ dtype=points.dtype)
+ elif axis == 2 or axis == -1:
+ rot_mat_T = np.array(
+ [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]],
+ dtype=points.dtype)
+ elif axis == 0:
+ rot_mat_T = np.array(
+ [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]],
+ dtype=points.dtype)
+ else:
+ raise ValueError('axis should be 0, 1, 2 or -1')
+
+ return points @ rot_mat_T, rot_mat_T
+
+
+def points_cam2img(points_3d, proj_mat, with_depth=False):
+ """Project points in camera coordinates to image coordinates.
+
+ Args:
+ points_3d (np.ndarray): Points in shape (N, 3)
+ proj_mat (np.ndarray): Transformation matrix between coordinates.
+ with_depth (bool, optional): Whether to keep depth in the output.
+ Defaults to False.
+
+ Returns:
+ np.ndarray: Points in image coordinates with shape [N, 2].
+ """
+ points_shape = list(points_3d.shape)
+ points_shape[-1] = 1
+
+ assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
+ f' matrix should be 2 instead of {len(proj_mat.shape)}.'
+ d1, d2 = proj_mat.shape[:2]
+ assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
+ d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
+ f' ({d1}*{d2}) is not supported.'
+ if d1 == 3:
+ proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype)
+ proj_mat_expanded[:d1, :d2] = proj_mat
+ proj_mat = proj_mat_expanded
+
+ points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1)
+ point_2d = points_4 @ proj_mat.T
+ point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
+
+ if with_depth:
+ points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]],
+ axis=-1)
+ return points_2d_depth
+
+ return point_2d_res
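+
+
+# NOTE: illustrative sketch, not part of the upstream file. The intrinsic
+# matrix below is made up for the example; a 3x3 matrix is padded to 4x4
+# inside ``points_cam2img``.
+def _points_cam2img_sketch():
+    """Illustrative only: project one camera-frame point 2 m in front."""
+    K = np.array([[720., 0., 600.], [0., 720., 180.], [0., 0., 1.]])
+    pts = np.array([[0.5, -0.1, 2.0]])
+    # (0.5 * 720 + 2.0 * 600) / 2.0 = 780, (-0.1 * 720 + 2.0 * 180) / 2.0 = 144
+    return points_cam2img(pts, K)                    # approx. [[780., 144.]]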
+
+
+def box3d_to_bbox(box3d, P2):
+ """Convert box3d in camera coordinates to bbox in image coordinates.
+
+ Args:
+ box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
+ P2 (np.array, shape=[4, 4]): Intrinsics of Camera2.
+
+ Returns:
+ np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates.
+ """
+ box_corners = center_to_corner_box3d(
+ box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1)
+ box_corners_in_image = points_cam2img(box_corners, P2)
+ # box_corners_in_image: [N, 8, 2]
+ minxy = np.min(box_corners_in_image, axis=1)
+ maxxy = np.max(box_corners_in_image, axis=1)
+ bbox = np.concatenate([minxy, maxxy], axis=1)
+ return bbox
+
+
+def corner_to_surfaces_3d(corners):
+ """convert 3d box corners from corner function above to surfaces that
+ normal vectors all direct to internal.
+
+ Args:
+ corners (np.ndarray): 3D box corners with shape of (N, 8, 3).
+
+ Returns:
+ np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
+ """
+ # box_corners: [N, 8, 3], must from corner functions in this module
+ surfaces = np.array([
+ [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]],
+ [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]],
+ [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]],
+ [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]],
+ [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]],
+ [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]],
+ ]).transpose([2, 0, 1, 3])
+ return surfaces
+
+
+def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)):
+ """Check points in rotated bbox and return indicces.
+
+ Args:
+ points (np.ndarray, shape=[N, 3+dim]): Points to query.
+ rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation.
+ z_axis (int, optional): Indicate which axis is height.
+ Defaults to 2.
+ origin (tuple[int], optional): Indicate the position of
+ box center. Defaults to (0.5, 0.5, 0).
+
+ Returns:
+ np.ndarray, shape=[N, M]: Indices of points in each box.
+ """
+ # TODO: this function is different from PointCloud3D, be careful
+ # when start to use nuscene, check the input
+ rbbox_corners = center_to_corner_box3d(
+ rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis)
+ surfaces = corner_to_surfaces_3d(rbbox_corners)
+ indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces)
+ return indices
+
+
+def minmax_to_corner_2d(minmax_box):
+ """Convert minmax box to corners2d.
+
+ Args:
+ minmax_box (np.ndarray, shape=[N, dims]): minmax boxes.
+
+ Returns:
+ np.ndarray: 2d corners of boxes
+ """
+ ndim = minmax_box.shape[-1] // 2
+ center = minmax_box[..., :ndim]
+ dims = minmax_box[..., ndim:] - center
+ return center_to_corner_box2d(center, dims, origin=0.0)
+
+
+def limit_period(val, offset=0.5, period=np.pi):
+ """Limit the value into a period for periodic function.
+
+ Args:
+ val (np.ndarray): The value to be converted.
+ offset (float, optional): Offset to set the value range. \
+ Defaults to 0.5.
+ period (float, optional): Period of the value. Defaults to np.pi.
+
+ Returns:
+ np.ndarray: Value in the range of \
+ [-offset * period, (1-offset) * period]
+ """
+ return val - np.floor(val / period + offset) * period
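+
+
+# NOTE: illustrative sketch, not part of the upstream file. With the default
+# offset=0.5 and period=pi, any angle is wrapped into [-pi/2, pi/2).
+def _limit_period_sketch():
+    """Illustrative only: 3*pi/4 -> -pi/4, -pi -> 0, 5*pi -> 0."""
+    angles = np.array([3 * np.pi / 4, -np.pi, 5 * np.pi])
+    return limit_period(angles)                      # approx. [-pi/4, 0., 0.]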
+
+
+def create_anchors_3d_range(feature_size,
+ anchor_range,
+ sizes=((1.6, 3.9, 1.56), ),
+ rotations=(0, np.pi / 2),
+ dtype=np.float32):
+ """Create anchors 3d by range.
+
+ Args:
+ feature_size (list[float] | tuple[float]): Feature map size. It is
+ either a list of a tuple of [D, H, W](in order of z, y, and x).
+ anchor_range (torch.Tensor | list[float]): Range of anchors with
+ shape [6]. The order is consistent with that of anchors, i.e.,
+ (x_min, y_min, z_min, x_max, y_max, z_max).
+ sizes (list[list] | np.ndarray | torch.Tensor, optional):
+ Anchor size with shape [N, 3], in order of x, y, z.
+ Defaults to ((1.6, 3.9, 1.56), ).
+ rotations (list[float] | np.ndarray | torch.Tensor, optional):
+ Rotations of anchors in a single feature grid.
+ Defaults to (0, np.pi / 2).
+ dtype (type, optional): Data type. Default to np.float32.
+
+ Returns:
+ np.ndarray: Range based anchors with shape of \
+ (*feature_size, num_sizes, num_rots, 7).
+ """
+ anchor_range = np.array(anchor_range, dtype)
+ z_centers = np.linspace(
+ anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype)
+ y_centers = np.linspace(
+ anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype)
+ x_centers = np.linspace(
+ anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype)
+ sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])
+ rotations = np.array(rotations, dtype=dtype)
+ rets = np.meshgrid(
+ x_centers, y_centers, z_centers, rotations, indexing='ij')
+ tile_shape = [1] * 5
+ tile_shape[-2] = int(sizes.shape[0])
+ for i in range(len(rets)):
+ rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
+ rets[i] = rets[i][..., np.newaxis] # for concat
+ sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
+ tile_size_shape = list(rets[0].shape)
+ tile_size_shape[3] = 1
+ sizes = np.tile(sizes, tile_size_shape)
+ rets.insert(3, sizes)
+ ret = np.concatenate(rets, axis=-1)
+ return np.transpose(ret, [2, 1, 0, 3, 4, 5])
+
+
+def center_to_minmax_2d(centers, dims, origin=0.5):
+ """Center to minmax.
+
+ Args:
+ centers (np.ndarray): Center points.
+ dims (np.ndarray): Dimensions.
+ origin (list or array or float, optional): Origin point relate
+ to smallest point. Defaults to 0.5.
+
+ Returns:
+ np.ndarray: Minmax points.
+ """
+ if origin == 0.5:
+ return np.concatenate([centers - dims / 2, centers + dims / 2],
+ axis=-1)
+ corners = center_to_corner_box2d(centers, dims, origin=origin)
+ return corners[:, [0, 2]].reshape([-1, 4])
+
+
+def rbbox2d_to_near_bbox(rbboxes):
+ """convert rotated bbox to nearest 'standing' or 'lying' bbox.
+
+ Args:
+ rbboxes (np.ndarray): Rotated bboxes with shape of \
+ (N, 5(x, y, xdim, ydim, rad)).
+
+ Returns:
+ np.ndarray: Bounding boxes with the shape of
+ (N, 4(xmin, ymin, xmax, ymax)).
+ """
+ rots = rbboxes[..., -1]
+ rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi))
+ cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis]
+ bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4])
+ bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:])
+ return bboxes
+
+
+@numba.jit(nopython=True)
+def iou_jit(boxes, query_boxes, mode='iou', eps=0.0):
+ """Calculate box iou. Note that jit version runs ~10x faster than the
+ box_overlaps function in mmdet3d.core.evaluation.
+
+ Args:
+ boxes (np.ndarray): Input bounding boxes with shape of (N, 4).
+ query_boxes (np.ndarray): Query boxes with shape of (K, 4).
+ mode (str, optional): IoU mode. Defaults to 'iou'.
+ eps (float, optional): Value added to denominator. Defaults to 0.
+
+ Returns:
+ np.ndarray: Overlap between boxes and query_boxes
+ with the shape of [N, K].
+ """
+ N = boxes.shape[0]
+ K = query_boxes.shape[0]
+ overlaps = np.zeros((N, K), dtype=boxes.dtype)
+ for k in range(K):
+ box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) *
+ (query_boxes[k, 3] - query_boxes[k, 1] + eps))
+ for n in range(N):
+ iw = (
+ min(boxes[n, 2], query_boxes[k, 2]) -
+ max(boxes[n, 0], query_boxes[k, 0]) + eps)
+ if iw > 0:
+ ih = (
+ min(boxes[n, 3], query_boxes[k, 3]) -
+ max(boxes[n, 1], query_boxes[k, 1]) + eps)
+ if ih > 0:
+ if mode == 'iou':
+ ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
+ (boxes[n, 3] - boxes[n, 1] + eps) + box_area -
+ iw * ih)
+ else:
+ ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
+ (boxes[n, 3] - boxes[n, 1] + eps))
+ overlaps[n, k] = iw * ih / ua
+ return overlaps
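+
+
+# NOTE: illustrative sketch, not part of the upstream file. Two 10x10 boxes
+# overlapping on half of their area.
+def _iou_jit_sketch():
+    """Illustrative only: intersection 50, union 150 -> IoU of about 0.333."""
+    boxes = np.array([[0., 0., 10., 10.]])
+    query = np.array([[5., 0., 15., 10.]])
+    return iou_jit(boxes, query)                     # approx. [[0.333]]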
+
+
+def projection_matrix_to_CRT_kitti(proj):
+ """Split projection matrix of kitti.
+
+ P = C @ [R|T]
+ C is an upper triangular matrix, so we invert CR and use a QR
+ decomposition, which is numerically stable for all KITTI camera
+ projection matrices.
+
+ Args:
+ proj (np.ndarray, shape=[4, 4]): Projection matrix of the camera.
+
+ Returns:
+ tuple[np.ndarray]: Split matrices C, R and T.
+ """
+
+ CR = proj[0:3, 0:3]
+ CT = proj[0:3, 3]
+ RinvCinv = np.linalg.inv(CR)
+ Rinv, Cinv = np.linalg.qr(RinvCinv)
+ C = np.linalg.inv(Cinv)
+ R = np.linalg.inv(Rinv)
+ T = Cinv @ CT
+ return C, R, T
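+
+
+# NOTE: illustrative sketch, not part of the upstream file. The intrinsics and
+# translation below are made up; the check only verifies that the split
+# matrices reassemble the original projection, i.e. P[:3, :4] == C @ [R|T].
+def _crt_decomposition_sketch():
+    """Illustrative only: round-trip a toy projection through the split."""
+    C0 = np.array([[720., 0., 600.], [0., 720., 180.], [0., 0., 1.]])
+    T0 = np.array([0.1, -0.05, 0.2])
+    P = np.eye(4)
+    P[:3, :3] = C0                                   # R0 is the identity here
+    P[:3, 3] = C0 @ T0
+    C, R, T = projection_matrix_to_CRT_kitti(P)
+    return np.allclose(C @ np.hstack([R, T[:, None]]), P[:3, :4])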
+
+
+def remove_outside_points(points, rect, Trv2c, P2, image_shape):
+ """Remove points which are outside of image.
+
+ Args:
+ points (np.ndarray, shape=[N, 3+dims]): Total points.
+ rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+ specific camera coordinate (e.g. CAM2) to CAM0.
+ Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in
+ camera coordinate to lidar coordinate.
+ P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
+ image_shape (list[int]): Shape of image.
+
+ Returns:
+ np.ndarray, shape=[N, 3+dims]: Filtered points.
+ """
+ # 5x faster than remove_outside_points_v1(2ms vs 10ms)
+ C, R, T = projection_matrix_to_CRT_kitti(P2)
+ image_bbox = [0, 0, image_shape[1], image_shape[0]]
+ frustum = get_frustum(image_bbox, C)
+ frustum -= T
+ frustum = np.linalg.inv(R) @ frustum.T
+ frustum = camera_to_lidar(frustum.T, rect, Trv2c)
+ frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...])
+ indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces)
+ points = points[indices.reshape([-1])]
+ return points
+
+
+def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
+ """Get frustum corners in camera coordinates.
+
+ Args:
+ bbox_image (list[int]): box in image coordinates.
+ C (np.ndarray): Intrinsics.
+ near_clip (float, optional): Nearest distance of frustum.
+ Defaults to 0.001.
+ far_clip (float, optional): Farthest distance of frustum.
+ Defaults to 100.
+
+ Returns:
+ np.ndarray, shape=[8, 3]: coordinates of frustum corners.
+ """
+ fku = C[0, 0]
+ fkv = -C[1, 1]
+ u0v0 = C[0:2, 2]
+ z_points = np.array(
+ [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis]
+ b = bbox_image
+ box_corners = np.array(
+ [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]],
+ dtype=C.dtype)
+ near_box_corners = (box_corners - u0v0) / np.array(
+ [fku / near_clip, -fkv / near_clip], dtype=C.dtype)
+ far_box_corners = (box_corners - u0v0) / np.array(
+ [fku / far_clip, -fkv / far_clip], dtype=C.dtype)
+ ret_xy = np.concatenate([near_box_corners, far_box_corners],
+ axis=0) # [8, 2]
+ ret_xyz = np.concatenate([ret_xy, z_points], axis=1)
+ return ret_xyz
+
+
+def surface_equ_3d(polygon_surfaces):
+ """
+
+ Args:
+ polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+ [num_polygon, max_num_surfaces, max_num_points_of_surface, 3].
+ All surfaces' normal vector must direct to internal.
+ Max_num_points_of_surface must be at least 3.
+
+ Returns:
+ tuple: normal vector and its direction.
+ """
+ # return [a, b, c], d in ax+by+cz+d=0
+ # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
+ surface_vec = polygon_surfaces[:, :, :2, :] - \
+ polygon_surfaces[:, :, 1:3, :]
+ # normal_vec: [..., 3]
+ normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
+ # print(normal_vec.shape, points[..., 0, :].shape)
+ # d = -np.inner(normal_vec, points[..., 0, :])
+ d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
+ return normal_vec, -d
+
+
+@numba.njit
+def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d,
+ num_surfaces):
+ """
+ Args:
+ points (np.ndarray): Input points with shape of (num_points, 3).
+ polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+ (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+ All surfaces' normal vector must direct to internal.
+ Max_num_points_of_surface must be at least 3.
+ normal_vec (np.ndarray): Normal vector of polygon_surfaces.
+ d (int): Directions of normal vector.
+ num_surfaces (np.ndarray): Number of surfaces a polygon contains
+ shape of (num_polygon).
+
+ Returns:
+ np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+ """
+ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
+ num_points = points.shape[0]
+ num_polygons = polygon_surfaces.shape[0]
+ ret = np.ones((num_points, num_polygons), dtype=np.bool_)
+ sign = 0.0
+ for i in range(num_points):
+ for j in range(num_polygons):
+ for k in range(max_num_surfaces):
+ if k > num_surfaces[j]:
+ break
+ sign = (
+ points[i, 0] * normal_vec[j, k, 0] +
+ points[i, 1] * normal_vec[j, k, 1] +
+ points[i, 2] * normal_vec[j, k, 2] + d[j, k])
+ if sign >= 0:
+ ret[i, j] = False
+ break
+ return ret
+
+
+def points_in_convex_polygon_3d_jit(points,
+ polygon_surfaces,
+ num_surfaces=None):
+ """Check points is in 3d convex polygons.
+
+ Args:
+ points (np.ndarray): Input points with shape of (num_points, 3).
+ polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+ (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+ All surfaces' normal vector must direct to internal.
+ Max_num_points_of_surface must be at least 3.
+ num_surfaces (np.ndarray, optional): Number of surfaces a polygon
+ contains shape of (num_polygon). Defaults to None.
+
+ Returns:
+ np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+ """
+ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
+ # num_points = points.shape[0]
+ num_polygons = polygon_surfaces.shape[0]
+ if num_surfaces is None:
+ num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64)
+ normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :])
+ # normal_vec: [num_polygon, max_num_surfaces, 3]
+ # d: [num_polygon, max_num_surfaces]
+ return _points_in_convex_polygon_3d_jit(points, polygon_surfaces,
+ normal_vec, d, num_surfaces)
+
+
+@numba.jit
+def points_in_convex_polygon_jit(points, polygon, clockwise=True):
+ """Check points is in 2d convex polygons. True when point in polygon.
+
+ Args:
+ points (np.ndarray): Input points with the shape of [num_points, 2].
+ polygon (np.ndarray): Input polygon with the shape of
+ [num_polygon, num_points_of_polygon, 2].
+ clockwise (bool, optional): Indicate polygon is clockwise. Defaults
+ to True.
+
+ Returns:
+ np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+ """
+ # first convert polygon to directed lines
+ num_points_of_polygon = polygon.shape[1]
+ num_points = points.shape[0]
+ num_polygons = polygon.shape[0]
+ # if clockwise:
+ # vec1 = polygon - polygon[:, [num_points_of_polygon - 1] +
+ # list(range(num_points_of_polygon - 1)), :]
+ # else:
+ # vec1 = polygon[:, [num_points_of_polygon - 1] +
+ # list(range(num_points_of_polygon - 1)), :] - polygon
+ # vec1: [num_polygon, num_points_of_polygon, 2]
+ vec1 = np.zeros((2), dtype=polygon.dtype)
+ ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
+ success = True
+ cross = 0.0
+ for i in range(num_points):
+ for j in range(num_polygons):
+ success = True
+ for k in range(num_points_of_polygon):
+ if clockwise:
+ vec1 = polygon[j, k] - polygon[j, k - 1]
+ else:
+ vec1 = polygon[j, k - 1] - polygon[j, k]
+ cross = vec1[1] * (polygon[j, k, 0] - points[i, 0])
+ cross -= vec1[0] * (polygon[j, k, 1] - points[i, 1])
+ if cross >= 0:
+ success = False
+ break
+ ret[i, j] = success
+ return ret
+
+
+def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
+ """Convert kitti center boxes to corners.
+
+ 7 -------- 4
+ /| /|
+ 6 -------- 5 .
+ | | | |
+ . 3 -------- 0
+ |/ |/
+ 2 -------- 1
+
+ Args:
+ boxes3d (np.ndarray): Boxes with shape of (N, 7)
+ [x, y, z, w, l, h, ry] in LiDAR coords, see the definition of ry
+ in KITTI dataset.
+ bottom_center (bool, optional): Whether z is on the bottom center
+ of object. Defaults to True.
+
+ Returns:
+ np.ndarray: Box corners with the shape of [N, 8, 3].
+ """
+ boxes_num = boxes3d.shape[0]
+ w, l, h = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
+ x_corners = np.array(
+ [w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2., w / 2.],
+ dtype=np.float32).T
+ y_corners = np.array(
+ [-l / 2., -l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2.],
+ dtype=np.float32).T
+ if bottom_center:
+ z_corners = np.zeros((boxes_num, 8), dtype=np.float32)
+ z_corners[:, 4:8] = h.reshape(boxes_num, 1).repeat(4, axis=1) # (N, 8)
+ else:
+ z_corners = np.array([
+ -h / 2., -h / 2., -h / 2., -h / 2., h / 2., h / 2., h / 2., h / 2.
+ ],
+ dtype=np.float32).T
+
+ ry = boxes3d[:, 6]
+ zeros, ones = np.zeros(
+ ry.size, dtype=np.float32), np.ones(
+ ry.size, dtype=np.float32)
+ rot_list = np.array([[np.cos(ry), -np.sin(ry), zeros],
+ [np.sin(ry), np.cos(ry), zeros], [zeros, zeros,
+ ones]]) # (3, 3, N)
+ R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3)
+
+ temp_corners = np.concatenate((x_corners.reshape(
+ -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)),
+ axis=2) # (N, 8, 3)
+ rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3)
+ x_corners = rotated_corners[:, :, 0]
+ y_corners = rotated_corners[:, :, 1]
+ z_corners = rotated_corners[:, :, 2]
+
+ x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
+
+ x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8)
+ y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8)
+ z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8)
+
+ corners = np.concatenate(
+ (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)),
+ axis=2)
+
+ return corners.astype(np.float32)
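+
+
+# NOTE: a minimal usage sketch added for illustration only (not part of the
+# upstream mmdet3d file); the intrinsic matrix, image box and LiDAR box values
+# below are made up, and `np` is assumed to be numpy imported at the top of
+# this module (as used throughout the file).
+if __name__ == '__main__':
+    cam_intrinsic = np.array([[1000., 0., 800.],
+                              [0., 1000., 450.],
+                              [0., 0., 1.]])
+    frustum = get_frustum([100, 100, 300, 250], cam_intrinsic)
+    print(frustum.shape)  # (8, 3): 4 near-plane + 4 far-plane corners
+
+    boxes = np.array([[10., 2., -1., 1.8, 4.5, 1.6, 0.3]], dtype=np.float32)
+    corners = boxes3d_to_corners3d_lidar(boxes)
+    print(corners.shape)  # (1, 8, 3)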
diff --git a/mmcv/core/bbox/builder.py b/mmcv/core/bbox/builder.py
new file mode 100644
index 0000000..682683b
--- /dev/null
+++ b/mmcv/core/bbox/builder.py
@@ -0,0 +1,20 @@
+from mmcv.utils import Registry, build_from_cfg
+
+BBOX_ASSIGNERS = Registry('bbox_assigner')
+BBOX_SAMPLERS = Registry('bbox_sampler')
+BBOX_CODERS = Registry('bbox_coder')
+
+
+def build_assigner(cfg, **default_args):
+ """Builder of box assigner."""
+ return build_from_cfg(cfg, BBOX_ASSIGNERS, default_args)
+
+
+def build_sampler(cfg, **default_args):
+ """Builder of box sampler."""
+ return build_from_cfg(cfg, BBOX_SAMPLERS, default_args)
+
+
+def build_bbox_coder(cfg, **default_args):
+ """Builder of box coder."""
+ return build_from_cfg(cfg, BBOX_CODERS, default_args)
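+
+
+# NOTE: illustrative sketch only (not part of the original file); the config
+# values are made up. Building from a cfg dict requires the concrete coder
+# class to be imported first so that it gets registered in BBOX_CODERS.
+if __name__ == '__main__':
+    from mmcv.core.bbox import build_bbox_coder as _build_bbox_coder
+    from mmcv.core.bbox.coder import NMSFreeCoder  # noqa: F401, triggers registration
+
+    coder_cfg = dict(
+        type='NMSFreeCoder',
+        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        max_num=300,
+        num_classes=10)
+    bbox_coder = _build_bbox_coder(coder_cfg)
+    print(type(bbox_coder).__name__)  # NMSFreeCoder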
diff --git a/mmcv/core/bbox/coder/__init__.py b/mmcv/core/bbox/coder/__init__.py
new file mode 100644
index 0000000..ab2e6be
--- /dev/null
+++ b/mmcv/core/bbox/coder/__init__.py
@@ -0,0 +1,11 @@
+from .nms_free_coder import NMSFreeCoder
+from .detr3d_track_coder import DETRTrack3DCoder
+from mmcv.core.bbox import build_bbox_coder
+from .fut_nms_free_coder import CustomNMSFreeCoder
+from .map_nms_free_coder import MapNMSFreeCoder
+
+__all__ = [
+ 'build_bbox_coder',
+ 'NMSFreeCoder', 'DETRTrack3DCoder',
+ 'CustomNMSFreeCoder','MapNMSFreeCoder'
+]
diff --git a/mmcv/core/bbox/coder/base_bbox_coder.py b/mmcv/core/bbox/coder/base_bbox_coder.py
new file mode 100644
index 0000000..cf0b34c
--- /dev/null
+++ b/mmcv/core/bbox/coder/base_bbox_coder.py
@@ -0,0 +1,17 @@
+from abc import ABCMeta, abstractmethod
+
+
+class BaseBBoxCoder(metaclass=ABCMeta):
+ """Base bounding box coder."""
+
+ def __init__(self, **kwargs):
+ pass
+
+ @abstractmethod
+ def encode(self, bboxes, gt_bboxes):
+ """Encode deltas between bboxes and ground truth boxes."""
+
+ @abstractmethod
+ def decode(self, bboxes, bboxes_pred):
+ """Decode the predicted bboxes according to prediction and base
+ boxes."""
diff --git a/mmcv/core/bbox/coder/detr3d_track_coder.py b/mmcv/core/bbox/coder/detr3d_track_coder.py
new file mode 100755
index 0000000..1c0e017
--- /dev/null
+++ b/mmcv/core/bbox/coder/detr3d_track_coder.py
@@ -0,0 +1,156 @@
+import torch
+
+from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder
+from mmcv.core.bbox.builder import BBOX_CODERS
+from ..util import normalize_bbox, denormalize_bbox
+from ..structures.utils import xywhr2xyxyr
+from mmcv.ops.iou3d import nms_bev
+
+@BBOX_CODERS.register_module()
+class DETRTrack3DCoder(BaseBBoxCoder):
+ """Bbox coder for DETR3D.
+ Args:
+ pc_range (list[float]): Range of point cloud.
+ post_center_range (list[float]): Limit of the center.
+ Default: None.
+ max_num (int): Max number to be kept. Default: 100.
+ score_threshold (float): Threshold to filter boxes based on score.
+ Default: None.
+ code_size (int): Code size of bboxes. Default: 9
+ """
+
+ def __init__(self,
+ pc_range,
+ post_center_range=None,
+ max_num=100,
+ score_threshold=0.2,
+ num_classes=7,
+ with_nms=False,
+ iou_thres=0.3):
+
+ self.pc_range = pc_range
+ self.post_center_range = post_center_range
+ self.max_num = max_num
+ self.score_threshold = score_threshold
+ self.num_classes = num_classes
+ self.with_nms = with_nms
+ self.nms_iou_thres = iou_thres
+
+ def encode(self):
+ pass
+
+ def decode_single(self, cls_scores, bbox_preds,
+ track_scores, obj_idxes, with_mask=True, img_metas=None):
+ """Decode bboxes.
+ Args:
+ cls_scores (Tensor): Outputs from the classification head, \
+ shape [num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ bbox_preds (Tensor): Outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [num_query, 9].
+
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ max_num = min(cls_scores.size(0), self.max_num)
+
+ cls_scores = cls_scores.sigmoid()
+ _, indexs = cls_scores.max(dim=-1)
+ labels = indexs % self.num_classes
+
+ _, bbox_index = track_scores.topk(max_num)
+
+ labels = labels[bbox_index]
+ bbox_preds = bbox_preds[bbox_index]
+ track_scores = track_scores[bbox_index]
+ obj_idxes = obj_idxes[bbox_index]
+
+ scores = track_scores
+
+ final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
+ final_scores = track_scores
+ final_preds = labels
+
+ # use score threshold
+ if self.score_threshold is not None:
+ thresh_mask = final_scores > self.score_threshold
+
+ if self.with_nms:
+ boxes_for_nms = xywhr2xyxyr(img_metas[0]['box_type_3d'](final_box_preds[:, :], 9).bev)
+ nms_mask = boxes_for_nms.new_zeros(boxes_for_nms.shape[0]) > 0
+ # print(self.nms_iou_thres)
+ try:
+ selected = nms_bev(
+ boxes_for_nms,
+ final_scores,
+ thresh=self.nms_iou_thres)
+ nms_mask[selected] = True
+ except Exception:
+ print('Error', boxes_for_nms, final_scores)
+ nms_mask = boxes_for_nms.new_ones(boxes_for_nms.shape[0]) > 0
+ if self.post_center_range is not None:
+ self.post_center_range = torch.tensor(
+ self.post_center_range, device=scores.device)
+ mask = (final_box_preds[..., :3] >=
+ self.post_center_range[:3]).all(1)
+ mask &= (final_box_preds[..., :3] <=
+ self.post_center_range[3:]).all(1)
+
+ if self.score_threshold:
+ mask &= thresh_mask
+ if not with_mask:
+ mask = torch.ones_like(mask) > 0
+ if self.with_nms:
+ mask &= nms_mask
+
+ boxes3d = final_box_preds[mask]
+ scores = final_scores[mask]
+ labels = final_preds[mask]
+ track_scores = track_scores[mask]
+ obj_idxes = obj_idxes[mask]
+ predictions_dict = {
+ 'bboxes': boxes3d,
+ 'scores': scores,
+ 'labels': labels,
+ 'track_scores': track_scores,
+ 'obj_idxes': obj_idxes,
+ 'bbox_index': bbox_index,
+ 'mask': mask
+ }
+
+ else:
+ raise NotImplementedError(
+ 'Need to reorganize output as a batch, only '
+ 'support post_center_range is not None for now!')
+ return predictions_dict
+
+ def decode(self, preds_dicts, with_mask=True, img_metas=None):
+ """Decode bboxes.
+ Args:
+ cls_scores (Tensor): Outputs from the classification head, \
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ Note: before sigmoid!
+ bbox_preds (Tensor): Sigmoid outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [nb_dec, bs, num_query, 9].
+
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ all_cls_scores = preds_dicts['cls_scores']
+ all_bbox_preds = preds_dicts['bbox_preds']
+ track_scores = preds_dicts['track_scores']
+ obj_idxes = preds_dicts['obj_idxes']
+
+ batch_size = all_cls_scores.size()[0]
+ predictions_list = []
+ # bs size = 1
+ predictions_list.append(self.decode_single(
+ all_cls_scores, all_bbox_preds,
+ track_scores, obj_idxes, with_mask, img_metas))
+ #for i in range(batch_size):
+ # predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+ return predictions_list
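+
+
+# NOTE: a minimal decode sketch on random tensors, for illustration only; the
+# pc_range / post_center_range values and tensor shapes are assumptions, not
+# taken from any config in this repo.
+if __name__ == '__main__':
+    num_query, num_classes = 300, 7
+    coder = DETRTrack3DCoder(
+        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        max_num=100,
+        num_classes=num_classes)
+    preds_dicts = dict(
+        cls_scores=torch.randn(num_query, num_classes),
+        bbox_preds=torch.randn(num_query, 10),
+        track_scores=torch.rand(num_query),
+        obj_idxes=torch.arange(num_query))
+    out = coder.decode(preds_dicts)[0]
+    print(out['bboxes'].shape, out['labels'].shape)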
diff --git a/mmcv/core/bbox/coder/fut_nms_free_coder.py b/mmcv/core/bbox/coder/fut_nms_free_coder.py
new file mode 100644
index 0000000..b8a8a95
--- /dev/null
+++ b/mmcv/core/bbox/coder/fut_nms_free_coder.py
@@ -0,0 +1,127 @@
+import torch
+
+from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder
+from mmcv.core.bbox.builder import BBOX_CODERS
+from mmcv.core.bbox.util import denormalize_bbox
+import numpy as np
+
+
+@BBOX_CODERS.register_module()
+class CustomNMSFreeCoder(BaseBBoxCoder):
+ """Bbox coder for NMS-free detector.
+ Args:
+ pc_range (list[float]): Range of point cloud.
+ post_center_range (list[float]): Limit of the center.
+ Default: None.
+ max_num (int): Max number to be kept. Default: 100.
+ score_threshold (float): Threshold to filter boxes based on score.
+ Default: None.
+ code_size (int): Code size of bboxes. Default: 9
+ """
+
+ def __init__(self,
+ pc_range,
+ voxel_size=None,
+ post_center_range=None,
+ max_num=100,
+ score_threshold=None,
+ num_classes=10):
+ self.pc_range = pc_range
+ self.voxel_size = voxel_size
+ self.post_center_range = post_center_range
+ self.max_num = max_num
+ self.score_threshold = score_threshold
+ self.num_classes = num_classes
+
+ def encode(self):
+
+ pass
+
+ def decode_single(self, cls_scores, bbox_preds, traj_preds):
+ """Decode bboxes.
+ Args:
+ cls_scores (Tensor): Outputs from the classification head, \
+ shape [num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ bbox_preds (Tensor): Outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [num_query, 9].
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ max_num = self.max_num
+
+ cls_scores = cls_scores.sigmoid()
+ scores, indexs = cls_scores.view(-1).topk(max_num)
+ labels = indexs % self.num_classes
+ bbox_index = indexs // self.num_classes
+ bbox_preds = bbox_preds[bbox_index]
+ traj_preds = traj_preds[bbox_index]
+
+ final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
+ final_scores = scores
+ final_preds = labels
+ final_traj_preds = traj_preds
+
+ # use score threshold
+ if self.score_threshold is not None:
+ thresh_mask = final_scores > self.score_threshold
+ tmp_score = self.score_threshold
+ while thresh_mask.sum() == 0:
+ tmp_score *= 0.9
+ if tmp_score < 0.01:
+ thresh_mask = final_scores > -1
+ break
+ thresh_mask = final_scores >= tmp_score
+
+ if self.post_center_range is not None:
+ self.post_center_range = torch.tensor(
+ self.post_center_range, device=scores.device)
+ mask = (final_box_preds[..., :3] >=
+ self.post_center_range[:3]).all(1)
+ mask &= (final_box_preds[..., :3] <=
+ self.post_center_range[3:]).all(1)
+
+ if self.score_threshold:
+ mask &= thresh_mask
+
+ boxes3d = final_box_preds[mask]
+ scores = final_scores[mask]
+ labels = final_preds[mask]
+ trajs = final_traj_preds[mask]
+
+ predictions_dict = {
+ 'bboxes': boxes3d,
+ 'scores': scores,
+ 'labels': labels,
+ 'trajs': trajs
+ }
+
+ else:
+ raise NotImplementedError(
+ 'Need to reorganize output as a batch, only '
+ 'support post_center_range is not None for now!')
+ return predictions_dict
+
+ def decode(self, preds_dicts):
+ """Decode bboxes.
+ Args:
+ all_cls_scores (Tensor): Outputs from the classification head, \
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [nb_dec, bs, num_query, 9].
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ all_cls_scores = preds_dicts['all_cls_scores'][-1]
+ all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+ all_traj_preds = preds_dicts['all_traj_preds'][-1]
+
+ batch_size = all_cls_scores.size()[0]
+ predictions_list = []
+ for i in range(batch_size):
+ predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i], all_traj_preds[i]))
+ return predictions_list
+
diff --git a/mmcv/core/bbox/coder/map_nms_free_coder.py b/mmcv/core/bbox/coder/map_nms_free_coder.py
new file mode 100644
index 0000000..f20d300
--- /dev/null
+++ b/mmcv/core/bbox/coder/map_nms_free_coder.py
@@ -0,0 +1,126 @@
+import torch
+
+from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder
+from mmcv.core.bbox.builder import BBOX_CODERS
+from mmcv.models.vad_utils.map_utils import (
+ denormalize_2d_pts, denormalize_2d_bbox
+)
+
+
+@BBOX_CODERS.register_module()
+class MapNMSFreeCoder(BaseBBoxCoder):
+ """Bbox coder for NMS-free detector.
+ Args:
+ pc_range (list[float]): Range of point cloud.
+ post_center_range (list[float]): Limit of the center.
+ Default: None.
+ max_num (int): Max number to be kept. Default: 100.
+ score_threshold (float): Threshold to filter boxes based on score.
+ Default: None.
+ code_size (int): Code size of bboxes. Default: 9
+ """
+
+ def __init__(self,
+ pc_range,
+ voxel_size=None,
+ post_center_range=None,
+ max_num=100,
+ score_threshold=None,
+ num_classes=10):
+ self.pc_range = pc_range
+ self.voxel_size = voxel_size
+ self.post_center_range = post_center_range
+ self.max_num = max_num
+ self.score_threshold = score_threshold
+ self.num_classes = num_classes
+
+ def encode(self):
+
+ pass
+
+ def decode_single(self, cls_scores, bbox_preds, pts_preds):
+ """Decode bboxes.
+ Args:
+ cls_scores (Tensor): Outputs from the classification head, \
+ shape [num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ bbox_preds (Tensor): Outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [num_query, 9].
+ pts_preds (Tensor):
+ Shape [num_query, fixed_num_pts, 2]
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ max_num = self.max_num
+
+ cls_scores = cls_scores.sigmoid()
+ scores, indexs = cls_scores.view(-1).topk(max_num)
+ labels = indexs % self.num_classes
+ bbox_index = indexs // self.num_classes
+ bbox_preds = bbox_preds[bbox_index]
+ pts_preds = pts_preds[bbox_index]
+
+ final_box_preds = denormalize_2d_bbox(bbox_preds, self.pc_range)
+ final_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) #num_q,num_p,2
+ # final_box_preds = bbox_preds
+ final_scores = scores
+ final_preds = labels
+
+ # use score threshold
+ if self.score_threshold is not None:
+ thresh_mask = final_scores > self.score_threshold
+ tmp_score = self.score_threshold
+ while thresh_mask.sum() == 0:
+ tmp_score *= 0.9
+ if tmp_score < 0.01:
+ thresh_mask = final_scores > -1
+ break
+ thresh_mask = final_scores >= tmp_score
+
+ if self.post_center_range is not None:
+ self.post_center_range = torch.tensor(
+ self.post_center_range, device=scores.device)
+ mask = (final_box_preds[..., :4] >= self.post_center_range[:4]).all(1)
+ mask &= (final_box_preds[..., :4] <= self.post_center_range[4:]).all(1)
+
+ if self.score_threshold:
+ mask &= thresh_mask
+
+ boxes3d = final_box_preds[mask]
+ scores = final_scores[mask]
+ pts = final_pts_preds[mask]
+ labels = final_preds[mask]
+ predictions_dict = {
+ 'map_bboxes': boxes3d,
+ 'map_scores': scores,
+ 'map_labels': labels,
+ 'map_pts': pts,
+ }
+
+ else:
+ raise NotImplementedError(
+ 'Need to reorganize output as a batch, only '
+ 'support post_center_range is not None for now!')
+ return predictions_dict
+
+ def decode(self, preds_dicts):
+ """Decode bboxes.
+ Args:
+ all_cls_scores (Tensor): Outputs from the classification head, \
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [nb_dec, bs, num_query, 9].
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ all_cls_scores = preds_dicts['map_all_cls_scores'][-1]
+ all_bbox_preds = preds_dicts['map_all_bbox_preds'][-1]
+ all_pts_preds = preds_dicts['map_all_pts_preds'][-1]
+ batch_size = all_cls_scores.size()[0]
+ predictions_list = []
+ for i in range(batch_size):
+ predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i],all_pts_preds[i]))
+ return predictions_list
\ No newline at end of file
diff --git a/mmcv/core/bbox/coder/nms_free_coder.py b/mmcv/core/bbox/coder/nms_free_coder.py
new file mode 100755
index 0000000..95430bc
--- /dev/null
+++ b/mmcv/core/bbox/coder/nms_free_coder.py
@@ -0,0 +1,124 @@
+import torch
+
+from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder
+from mmcv.core.bbox.builder import BBOX_CODERS
+from mmcv.core.bbox.util import denormalize_bbox
+import numpy as np
+
+
+@BBOX_CODERS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+ """Bbox coder for NMS-free detector.
+ Args:
+ pc_range (list[float]): Range of point cloud.
+ post_center_range (list[float]): Limit of the center.
+ Default: None.
+ max_num (int): Max number to be kept. Default: 100.
+ score_threshold (float): Threshold to filter boxes based on score.
+ Default: None.
+ code_size (int): Code size of bboxes. Default: 9
+ """
+
+ def __init__(self,
+ pc_range,
+ voxel_size=None,
+ post_center_range=None,
+ max_num=100,
+ score_threshold=None,
+ num_classes=10):
+ self.pc_range = pc_range
+ self.voxel_size = voxel_size
+ self.post_center_range = post_center_range
+ self.max_num = max_num
+ self.score_threshold = score_threshold
+ self.num_classes = num_classes
+
+ def encode(self):
+
+ pass
+
+ def decode_single(self, cls_scores, bbox_preds):
+ """Decode bboxes.
+ Args:
+ cls_scores (Tensor): Outputs from the classification head, \
+ shape [num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ bbox_preds (Tensor): Outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [num_query, 9].
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ max_num = min(self.max_num, cls_scores.shape[0])
+
+ cls_scores = cls_scores.sigmoid()
+ scores, indexs = cls_scores.view(-1).topk(max_num)
+ labels = indexs % self.num_classes
+ bbox_index = indexs // self.num_classes
+ bbox_preds = bbox_preds[bbox_index]
+
+ final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
+ final_scores = scores
+ final_preds = labels
+
+ # use score threshold
+ if self.score_threshold is not None:
+ thresh_mask = final_scores > self.score_threshold
+ tmp_score = self.score_threshold
+ while thresh_mask.sum() == 0:
+ tmp_score *= 0.9
+ if tmp_score < 0.01:
+ thresh_mask = final_scores > -1
+ break
+ thresh_mask = final_scores >= tmp_score
+
+ if self.post_center_range is not None:
+ self.post_center_range = torch.tensor(
+ self.post_center_range, device=scores.device)
+ mask = (final_box_preds[..., :3] >=
+ self.post_center_range[:3]).all(1)
+ mask &= (final_box_preds[..., :3] <=
+ self.post_center_range[3:]).all(1)
+
+ if self.score_threshold:
+ mask &= thresh_mask
+
+ boxes3d = final_box_preds[mask]
+ scores = final_scores[mask]
+
+ labels = final_preds[mask]
+ predictions_dict = {
+ 'bboxes': boxes3d,
+ 'scores': scores,
+ 'labels': labels,
+ 'mask': mask,
+ 'bbox_index': bbox_index
+ }
+
+ else:
+ raise NotImplementedError(
+ 'Need to reorganize output as a batch, only '
+ 'support post_center_range is not None for now!')
+ return predictions_dict
+
+ def decode(self, preds_dicts):
+ """Decode bboxes.
+ Args:
+ all_cls_scores (Tensor): Outputs from the classification head, \
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+ cls_out_channels should include the background class.
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+ Shape [nb_dec, bs, num_query, 9].
+ Returns:
+ list[dict]: Decoded boxes.
+ """
+ all_cls_scores = preds_dicts['all_cls_scores'][-1]
+ all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+
+ batch_size = all_cls_scores.size()[0]
+ predictions_list = []
+ for i in range(batch_size):
+ predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+ return predictions_list
+
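+
+# NOTE: a minimal decode sketch on random predictions, for illustration only;
+# the shapes and ranges below (nb_dec=6, bs=1, 900 queries) are assumptions.
+if __name__ == '__main__':
+    nb_dec, bs, num_query, num_classes = 6, 1, 900, 10
+    coder = NMSFreeCoder(
+        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+        max_num=300,
+        num_classes=num_classes)
+    preds_dicts = dict(
+        all_cls_scores=torch.randn(nb_dec, bs, num_query, num_classes),
+        all_bbox_preds=torch.randn(nb_dec, bs, num_query, 10))
+    results = coder.decode(preds_dicts)
+    print(results[0]['bboxes'].shape)  # (num_kept, 9)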
diff --git a/mmcv/core/bbox/iou_calculators/__init__.py b/mmcv/core/bbox/iou_calculators/__init__.py
new file mode 100644
index 0000000..3c13f41
--- /dev/null
+++ b/mmcv/core/bbox/iou_calculators/__init__.py
@@ -0,0 +1,11 @@
+from .builder import build_iou_calculator
+from .iou2d_calculator import BboxOverlaps2D, bbox_overlaps
+from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
+ BboxOverlapsNearest3D,
+ axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
+ bbox_overlaps_nearest_3d)
+
+__all__ = ['build_iou_calculator', 'BboxOverlaps2D', 'bbox_overlaps',
+ 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
+ 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D',
+ 'axis_aligned_bbox_overlaps_3d']
diff --git a/mmcv/core/bbox/iou_calculators/builder.py b/mmcv/core/bbox/iou_calculators/builder.py
new file mode 100644
index 0000000..09094d7
--- /dev/null
+++ b/mmcv/core/bbox/iou_calculators/builder.py
@@ -0,0 +1,8 @@
+from mmcv.utils import Registry, build_from_cfg
+
+IOU_CALCULATORS = Registry('IoU calculator')
+
+
+def build_iou_calculator(cfg, default_args=None):
+ """Builder of IoU calculator."""
+ return build_from_cfg(cfg, IOU_CALCULATORS, default_args)
diff --git a/mmcv/core/bbox/iou_calculators/iou2d_calculator.py b/mmcv/core/bbox/iou_calculators/iou2d_calculator.py
new file mode 100644
index 0000000..25f2b46
--- /dev/null
+++ b/mmcv/core/bbox/iou_calculators/iou2d_calculator.py
@@ -0,0 +1,260 @@
+import torch
+
+from .builder import IOU_CALCULATORS
+
+
+def cast_tensor_type(x, scale=1., dtype=None):
+ if dtype == 'fp16':
+ # scale is for preventing overflows
+ x = (x / scale).half()
+ return x
+
+
+def fp16_clamp(x, min=None, max=None):
+ if not x.is_cuda and x.dtype == torch.float16:
+ # clamp for cpu float16, tensor fp16 has no clamp implementation
+ return x.float().clamp(min, max).half()
+
+ return x.clamp(min, max)
+
+
+@IOU_CALCULATORS.register_module()
+class BboxOverlaps2D:
+ """2D Overlaps (e.g. IoUs, GIoUs) Calculator."""
+
+ def __init__(self, scale=1., dtype=None):
+ self.scale = scale
+ self.dtype = dtype
+
+ def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+ """Calculate IoU between 2D bboxes.
+
+ Args:
+ bboxes1 (Tensor): bboxes have shape (m, 4) in <x1, y1, x2, y2>
+ format, or shape (m, 5) in <x1, y1, x2, y2, score> format.
+ bboxes2 (Tensor): bboxes have shape (n, 4) in <x1, y1, x2, y2>
+ format, shape (n, 5) in <x1, y1, x2, y2, score> format, or be
+ empty. If ``is_aligned `` is ``True``, then m and n must be
+ equal.
+ mode (str): "iou" (intersection over union), "iof" (intersection
+ over foreground), or "giou" (generalized intersection over
+ union).
+ is_aligned (bool, optional): If True, then m and n must be equal.
+ Default False.
+
+ Returns:
+ Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
+ """
+ assert bboxes1.size(-1) in [0, 4, 5]
+ assert bboxes2.size(-1) in [0, 4, 5]
+ if bboxes2.size(-1) == 5:
+ bboxes2 = bboxes2[..., :4]
+ if bboxes1.size(-1) == 5:
+ bboxes1 = bboxes1[..., :4]
+
+ if self.dtype == 'fp16':
+ # change tensor type to save cpu and cuda memory and keep speed
+ bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype)
+ bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype)
+ overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+ if not overlaps.is_cuda and overlaps.dtype == torch.float16:
+ # resume cpu float32
+ overlaps = overlaps.float()
+ return overlaps
+
+ return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
+
+ def __repr__(self):
+ """str: a string describing the module"""
+ repr_str = self.__class__.__name__ + f'(' \
+ f'scale={self.scale}, dtype={self.dtype})'
+ return repr_str
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
+ """Calculate overlap between two set of bboxes.
+
+ FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889
+ Note:
+ Assume bboxes1 is M x 4, bboxes2 is N x 4, when mode is 'iou',
+ there are some new generated variable when calculating IOU
+ using bbox_overlaps function:
+
+ 1) is_aligned is False
+ area1: M x 1
+ area2: N x 1
+ lt: M x N x 2
+ rb: M x N x 2
+ wh: M x N x 2
+ overlap: M x N x 1
+ union: M x N x 1
+ ious: M x N x 1
+
+ Total memory:
+ S = (9 x N x M + N + M) * 4 Byte,
+
+ When using FP16, we can reduce:
+ R = (9 x N x M + N + M) * 4 / 2 Byte
+ R larger than (N + M) * 4 * 2 always holds when N and M >= 1.
+ Obviously, N + M <= N * M < 3 * N * M when N >= 2 and M >= 2,
+ and N + 1 < 3 * N when N or M is 1.
+
+ Given M = 40 (ground truth), N = 400000 (three anchor boxes
+ per grid, FPN, R-CNNs),
+ R = 275 MB (one pass)
+
+ A special case (dense detection), M = 512 (ground truth),
+ R = 3516 MB = 3.43 GB
+
+ When the batch size is B, reduce:
+ B x R
+
+ Therefore, CUDA memory runs out frequently.
+
+ Experiments on GeForce RTX 2080Ti (11019 MiB):
+
+ | dtype | M | N | Use | Real | Ideal |
+ |:----:|:----:|:----:|:----:|:----:|:----:|
+ | FP32 | 512 | 400000 | 8020 MiB | -- | -- |
+ | FP16 | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB |
+ | FP32 | 40 | 400000 | 1540 MiB | -- | -- |
+ | FP16 | 40 | 400000 | 1264 MiB | 276MiB | 275 MiB |
+
+ 2) is_aligned is True
+ area1: N x 1
+ area2: N x 1
+ lt: N x 2
+ rb: N x 2
+ wh: N x 2
+ overlap: N x 1
+ union: N x 1
+ ious: N x 1
+
+ Total memory:
+ S = 11 x N * 4 Byte
+
+ When using FP16, we can reduce:
+ R = 11 x N * 4 / 2 Byte
+
+ The same holds for 'giou' (which needs more memory than 'iou').
+
+ Time-wise, FP16 is generally faster than FP32.
+
+ When gpu_assign_thr is not -1, assignment takes more time on the
+ CPU but does not reduce memory.
+ Therefore, we can halve the memory while keeping the speed.
+
+ If ``is_aligned `` is ``False``, then calculate the overlaps between each
+ bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+ pair of bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
+ bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
+ B indicates the batch dim, in shape (B1, B2, ..., Bn).
+ If ``is_aligned `` is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union), "iof" (intersection over
+ foreground) or "giou" (generalized intersection over union).
+ Default "iou".
+ is_aligned (bool, optional): If True, then m and n must be equal.
+ Default False.
+ eps (float, optional): A value added to the denominator for numerical
+ stability. Default 1e-6.
+
+ Returns:
+ Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
+
+ Example:
+ >>> bboxes1 = torch.FloatTensor([
+ >>> [0, 0, 10, 10],
+ >>> [10, 10, 20, 20],
+ >>> [32, 32, 38, 42],
+ >>> ])
+ >>> bboxes2 = torch.FloatTensor([
+ >>> [0, 0, 10, 20],
+ >>> [0, 10, 10, 19],
+ >>> [10, 10, 20, 20],
+ >>> ])
+ >>> overlaps = bbox_overlaps(bboxes1, bboxes2)
+ >>> assert overlaps.shape == (3, 3)
+ >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
+ >>> assert overlaps.shape == (3, )
+
+ Example:
+ >>> empty = torch.empty(0, 4)
+ >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
+ >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+ >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+ >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+ """
+
+ assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
+ # Either the boxes are empty or the length of boxes' last dimension is 4
+ assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+ assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+
+ # Batch dim must be the same
+ # Batch dim: (B1, B2, ... Bn)
+ assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+ batch_shape = bboxes1.shape[:-2]
+
+ rows = bboxes1.size(-2)
+ cols = bboxes2.size(-2)
+ if is_aligned:
+ assert rows == cols
+
+ if rows * cols == 0:
+ if is_aligned:
+ return bboxes1.new(batch_shape + (rows, ))
+ else:
+ return bboxes1.new(batch_shape + (rows, cols))
+
+ area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
+ bboxes1[..., 3] - bboxes1[..., 1])
+ area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
+ bboxes2[..., 3] - bboxes2[..., 1])
+
+ if is_aligned:
+ lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2]
+ rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2]
+
+ wh = fp16_clamp(rb - lt, min=0)
+ overlap = wh[..., 0] * wh[..., 1]
+
+ if mode in ['iou', 'giou']:
+ union = area1 + area2 - overlap
+ else:
+ union = area1
+ if mode == 'giou':
+ enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
+ enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
+ else:
+ lt = torch.max(bboxes1[..., :, None, :2],
+ bboxes2[..., None, :, :2]) # [B, rows, cols, 2]
+ rb = torch.min(bboxes1[..., :, None, 2:],
+ bboxes2[..., None, :, 2:]) # [B, rows, cols, 2]
+
+ wh = fp16_clamp(rb - lt, min=0)
+ overlap = wh[..., 0] * wh[..., 1]
+
+ if mode in ['iou', 'giou']:
+ union = area1[..., None] + area2[..., None, :] - overlap
+ else:
+ union = area1[..., None]
+ if mode == 'giou':
+ enclosed_lt = torch.min(bboxes1[..., :, None, :2],
+ bboxes2[..., None, :, :2])
+ enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
+ bboxes2[..., None, :, 2:])
+
+ eps = union.new_tensor([eps])
+ union = torch.max(union, eps)
+ ious = overlap / union
+ if mode in ['iou', 'iof']:
+ return ious
+ # calculate gious
+ enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
+ enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+ enclose_area = torch.max(enclose_area, eps)
+ gious = ious - (enclose_area - union) / enclose_area
+ return gious
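+
+
+# NOTE: small usage sketch, for illustration only; the box values are arbitrary.
+if __name__ == '__main__':
+    iou_calculator = BboxOverlaps2D()
+    bboxes = torch.FloatTensor([[0, 0, 10, 10], [5, 5, 15, 15]])
+    gt_bboxes = torch.FloatTensor([[0, 0, 10, 9], [4, 4, 16, 16]])
+    print(iou_calculator(bboxes, gt_bboxes))               # pairwise IoU, shape (2, 2)
+    print(iou_calculator(bboxes, gt_bboxes, mode='giou'))  # pairwise GIoU, shape (2, 2)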
diff --git a/mmcv/core/bbox/iou_calculators/iou3d_calculator.py b/mmcv/core/bbox/iou_calculators/iou3d_calculator.py
new file mode 100644
index 0000000..5bc00b4
--- /dev/null
+++ b/mmcv/core/bbox/iou_calculators/iou3d_calculator.py
@@ -0,0 +1,321 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .iou2d_calculator import bbox_overlaps
+from .builder import IOU_CALCULATORS
+from ..structures.utils import get_box_type
+
+
+@IOU_CALCULATORS.register_module()
+class BboxOverlapsNearest3D(object):
+ """Nearest 3D IoU Calculator.
+
+ Note:
+ This IoU calculator first finds the nearest 2D boxes in bird's-eye
+ view (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
+
+ Args:
+ coordinate (str): 'camera', 'lidar', or 'depth' coordinate system.
+ """
+
+ def __init__(self, coordinate='lidar'):
+ assert coordinate in ['camera', 'lidar', 'depth']
+ self.coordinate = coordinate
+
+ def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+ """Calculate nearest 3D IoU.
+
+ Note:
+ If ``is_aligned`` is ``False``, then it calculates the ious between
+ each bbox of bboxes1 and bboxes2, otherwise it calculates the ious
+ between each aligned pair of bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v].
+ bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v].
+ mode (str): "iou" (intersection over union) or iof
+ (intersection over foreground).
+ is_aligned (bool): Whether the calculation is aligned.
+
+ Return:
+ torch.Tensor: If ``is_aligned`` is ``False``, return ious between \
+ bboxes1 and bboxes2 with shape (N, M). If ``is_aligned`` is \
+ ``True``, return a tensor of shape (N, ).
+ """
+ return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned,
+ self.coordinate)
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(coordinate={self.coordinate})'
+ return repr_str
+
+
+@IOU_CALCULATORS.register_module()
+class BboxOverlaps3D(object):
+ """3D IoU Calculator.
+
+ Args:
+ coordinate (str): The coordinate system, valid options are
+ 'camera', 'lidar', and 'depth'.
+ """
+
+ def __init__(self, coordinate):
+ assert coordinate in ['camera', 'lidar', 'depth']
+ self.coordinate = coordinate
+
+ def __call__(self, bboxes1, bboxes2, mode='iou'):
+ """Calculate 3D IoU using cuda implementation.
+
+ Note:
+ This function calculate the IoU of 3D boxes based on their volumes.
+ IoU calculator ``:class:BboxOverlaps3D`` uses this function to
+ calculate the actual 3D IoUs of boxes.
+
+ Args:
+ bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry].
+ bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry].
+ mode (str): "iou" (intersection over union) or
+ iof (intersection over foreground).
+
+ Return:
+ torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \
+ with shape (N, M) (aligned mode is not supported currently).
+ """
+ return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)
+
+ def __repr__(self):
+ """str: return a string that describes the module"""
+ repr_str = self.__class__.__name__
+ repr_str += f'(coordinate={self.coordinate})'
+ return repr_str
+
+
+def bbox_overlaps_nearest_3d(bboxes1,
+ bboxes2,
+ mode='iou',
+ is_aligned=False,
+ coordinate='lidar'):
+ """Calculate nearest 3D IoU.
+
+ Note:
+ This function first finds the nearest 2D boxes in bird's-eye view
+ (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
+ The IoU calculator :class:`BboxOverlapsNearest3D` uses this
+ function to calculate IoUs of boxes.
+
+ If ``is_aligned`` is ``False``, then it calculates the ious between
+ each bbox of bboxes1 and bboxes2, otherwise the ious between each
+ aligned pair of bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v].
+ bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v].
+ mode (str): "iou" (intersection over union) or iof
+ (intersection over foreground).
+ is_aligned (bool): Whether the calculation is aligned
+
+ Return:
+ torch.Tensor: If ``is_aligned`` is ``False``, return ious between \
+ bboxes1 and bboxes2 with shape (N, M). If ``is_aligned`` is \
+ ``True``, return a tensor of shape (N, ).
+ """
+ assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
+
+ box_type, _ = get_box_type(coordinate)
+
+ bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
+ bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
+
+ # Change the bboxes to bev
+ # box conversion and iou calculation in torch version on CUDA
+ # is 10x faster than that in numpy version
+ bboxes1_bev = bboxes1.nearest_bev
+ bboxes2_bev = bboxes2.nearest_bev
+
+ ret = bbox_overlaps(
+ bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned)
+ return ret
+
+
+def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
+ """Calculate 3D IoU using cuda implementation.
+
+ Note:
+ This function calculates the IoU of 3D boxes based on their volumes.
+ IoU calculator :class:`BboxOverlaps3D` uses this function to
+ calculate the actual IoUs of boxes.
+
+ Args:
+ bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry].
+ bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry].
+ mode (str): "iou" (intersection over union) or
+ iof (intersection over foreground).
+ coordinate (str): 'camera' or 'lidar' coordinate system.
+
+ Return:
+ torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \
+ with shape (N, M) (aligned mode is not supported currently).
+ """
+ assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
+
+ box_type, _ = get_box_type(coordinate)
+
+ bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
+ bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
+
+ return bboxes1.overlaps(bboxes1, bboxes2, mode=mode)
+
+
+@IOU_CALCULATORS.register_module()
+class AxisAlignedBboxOverlaps3D(object):
+ """Axis-aligned 3D Overlaps (IoU) Calculator."""
+
+ def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
+ """Calculate IoU between 2D bboxes.
+
+ Args:
+ bboxes1 (Tensor): shape (B, m, 6) in
+ <x1, y1, z1, x2, y2, z2> format or empty.
+ bboxes2 (Tensor): shape (B, n, 6) in
+ <x1, y1, z1, x2, y2, z2> format or empty.
+ B indicates the batch dim, in shape (B1, B2, ..., Bn).
+ If ``is_aligned`` is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union) or "giou" (generalized
+ intersection over union).
+ is_aligned (bool, optional): If True, then m and n must be equal.
+ Default False.
+ Returns:
+ Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+ """
+ assert bboxes1.size(-1) == bboxes2.size(-1) == 6
+ return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode,
+ is_aligned)
+
+ def __repr__(self):
+ """str: a string describing the module"""
+ repr_str = self.__class__.__name__ + '()'
+ return repr_str
+
+
+def axis_aligned_bbox_overlaps_3d(bboxes1,
+ bboxes2,
+ mode='iou',
+ is_aligned=False,
+ eps=1e-6):
+ """Calculate overlap between two set of axis aligned 3D bboxes. If
+ ``is_aligned`` is ``False``, then calculate the overlaps between each bbox
+ of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of
+ bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (Tensor): shape (B, m, 6) in
+ <x1, y1, z1, x2, y2, z2> format or empty.
+ bboxes2 (Tensor): shape (B, n, 6) in
+ <x1, y1, z1, x2, y2, z2> format or empty.
+ B indicates the batch dim, in shape (B1, B2, ..., Bn).
+ If ``is_aligned`` is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union) or "giou" (generalized
+ intersection over union).
+ is_aligned (bool, optional): If True, then m and n must be equal.
+ Default False.
+ eps (float, optional): A value added to the denominator for numerical
+ stability. Default 1e-6.
+
+ Returns:
+ Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+
+ Example:
+ >>> bboxes1 = torch.FloatTensor([
+ >>> [0, 0, 0, 10, 10, 10],
+ >>> [10, 10, 10, 20, 20, 20],
+ >>> [32, 32, 32, 38, 40, 42],
+ >>> ])
+ >>> bboxes2 = torch.FloatTensor([
+ >>> [0, 0, 0, 10, 20, 20],
+ >>> [0, 10, 10, 10, 19, 20],
+ >>> [10, 10, 10, 20, 20, 20],
+ >>> ])
+ >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
+ >>> assert overlaps.shape == (3, 3)
+ >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, is_aligned=True)
+ >>> assert overlaps.shape == (3, )
+ Example:
+ >>> empty = torch.empty(0, 6)
+ >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
+ >>> assert tuple(axis_aligned_bbox_overlaps_3d(empty, nonempty).shape) == (0, 1)
+ >>> assert tuple(axis_aligned_bbox_overlaps_3d(nonempty, empty).shape) == (1, 0)
+ >>> assert tuple(axis_aligned_bbox_overlaps_3d(empty, empty).shape) == (0, 0)
+ """
+
+ assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
+ # Either the boxes are empty or the length of boxes' last dimension is 6
+ assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
+ assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)
+
+ # Batch dim must be the same
+ # Batch dim: (B1, B2, ... Bn)
+ assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+ batch_shape = bboxes1.shape[:-2]
+
+ rows = bboxes1.size(-2)
+ cols = bboxes2.size(-2)
+ if is_aligned:
+ assert rows == cols
+
+ if rows * cols == 0:
+ if is_aligned:
+ return bboxes1.new(batch_shape + (rows, ))
+ else:
+ return bboxes1.new(batch_shape + (rows, cols))
+
+ area1 = (bboxes1[..., 3] -
+ bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * (
+ bboxes1[..., 5] - bboxes1[..., 2])
+ area2 = (bboxes2[..., 3] -
+ bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * (
+ bboxes2[..., 5] - bboxes2[..., 2])
+
+ if is_aligned:
+ lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3]
+ rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3]
+
+ wh = (rb - lt).clamp(min=0) # [B, rows, 3]
+ overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
+
+ if mode in ['iou', 'giou']:
+ union = area1 + area2 - overlap
+ else:
+ union = area1
+ if mode == 'giou':
+ enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3])
+ enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
+ else:
+ lt = torch.max(bboxes1[..., :, None, :3],
+ bboxes2[..., None, :, :3]) # [B, rows, cols, 3]
+ rb = torch.min(bboxes1[..., :, None, 3:],
+ bboxes2[..., None, :, 3:]) # [B, rows, cols, 3]
+
+ wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3]
+ overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
+
+ if mode in ['iou', 'giou']:
+ union = area1[..., None] + area2[..., None, :] - overlap
+ if mode == 'giou':
+ enclosed_lt = torch.min(bboxes1[..., :, None, :3],
+ bboxes2[..., None, :, :3])
+ enclosed_rb = torch.max(bboxes1[..., :, None, 3:],
+ bboxes2[..., None, :, 3:])
+
+ eps = union.new_tensor([eps])
+ union = torch.max(union, eps)
+ ious = overlap / union
+ if mode in ['iou']:
+ return ious
+ # calculate gious
+ enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
+ enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2]
+ enclose_area = torch.max(enclose_area, eps)
+ gious = ious - (enclose_area - union) / enclose_area
+ return gious
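+
+
+# NOTE: usage sketch for the nearest-BEV IoU, for illustration only. It assumes
+# the LiDAR box structures behind `get_box_type('lidar')` are importable; the
+# 7-DoF box values are arbitrary.
+if __name__ == '__main__':
+    boxes_a = torch.tensor([[0.0, 0.0, -1.0, 1.8, 4.5, 1.6, 0.0]])
+    boxes_b = torch.tensor([[0.5, 0.0, -1.0, 1.8, 4.5, 1.6, 0.3]])
+    ious = bbox_overlaps_nearest_3d(boxes_a, boxes_b, coordinate='lidar')
+    print(ious.shape)  # (1, 1)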
diff --git a/mmcv/core/bbox/match_costs/__init__.py b/mmcv/core/bbox/match_costs/__init__.py
new file mode 100644
index 0000000..8fdb6d2
--- /dev/null
+++ b/mmcv/core/bbox/match_costs/__init__.py
@@ -0,0 +1,7 @@
+from .builder import build_match_cost
+from .match_cost import BBoxL1Cost, ClassificationCost, FocalLossCost, IoUCost, BBox3DL1Cost, DiceCost
+
+__all__ = [
+ 'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost',
+ 'FocalLossCost', 'BBox3DL1Cost', 'DiceCost'
+]
diff --git a/mmcv/core/bbox/match_costs/builder.py b/mmcv/core/bbox/match_costs/builder.py
new file mode 100644
index 0000000..6894017
--- /dev/null
+++ b/mmcv/core/bbox/match_costs/builder.py
@@ -0,0 +1,8 @@
+from mmcv.utils import Registry, build_from_cfg
+
+MATCH_COST = Registry('Match Cost')
+
+
+def build_match_cost(cfg, default_args=None):
+ """Builder of IoU calculator."""
+ return build_from_cfg(cfg, MATCH_COST, default_args)
diff --git a/mmcv/core/bbox/match_costs/match_cost.py b/mmcv/core/bbox/match_costs/match_cost.py
new file mode 100644
index 0000000..b5a6a68
--- /dev/null
+++ b/mmcv/core/bbox/match_costs/match_cost.py
@@ -0,0 +1,324 @@
+import torch
+import torch.nn.functional as F
+from mmcv.core.bbox.iou_calculators import bbox_overlaps
+from mmcv.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
+from .builder import MATCH_COST
+
+
+@MATCH_COST.register_module()
+class BBoxL1Cost:
+ """BBoxL1Cost.
+
+ Args:
+ weight (int | float, optional): loss_weight
+ box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN
+
+ Examples:
+ >>> from mmcv.core.bbox.match_costs.match_cost import BBoxL1Cost
+ >>> import torch
+ >>> self = BBoxL1Cost()
+ >>> bbox_pred = torch.rand(1, 4)
+ >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+ >>> factor = torch.tensor([10, 8, 10, 8])
+ >>> self(bbox_pred, gt_bboxes, factor)
+ tensor([[1.6172, 1.6422]])
+ """
+
+ def __init__(self, weight=1., box_format='xyxy'):
+ self.weight = weight
+ assert box_format in ['xyxy', 'xywh']
+ self.box_format = box_format
+
+ def __call__(self, bbox_pred, gt_bboxes):
+ """
+ Args:
+ bbox_pred (Tensor): Predicted boxes with normalized coordinates
+ (cx, cy, w, h), which are all in range [0, 1]. Shape
+ [num_query, 4].
+ gt_bboxes (Tensor): Ground truth boxes with normalized
+ coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+
+ Returns:
+ torch.Tensor: bbox_cost value with weight
+ """
+ if self.box_format == 'xywh':
+ gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes)
+ elif self.box_format == 'xyxy':
+ bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
+ bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+ return bbox_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class FocalLossCost:
+ """FocalLossCost.
+
+ Args:
+ weight (int | float, optional): loss_weight
+ alpha (int | float, optional): focal_loss alpha
+ gamma (int | float, optional): focal_loss gamma
+ eps (float, optional): default 1e-12
+
+ Examples:
+ >>> from mmcv.core.bbox.match_costs.match_cost import FocalLossCost
+ >>> import torch
+ >>> self = FocalLossCost()
+ >>> cls_pred = torch.rand(4, 3)
+ >>> gt_labels = torch.tensor([0, 1, 2])
+ >>> factor = torch.tensor([10, 8, 10, 8])
+ >>> self(cls_pred, gt_labels)
+ tensor([[-0.3236, -0.3364, -0.2699],
+ [-0.3439, -0.3209, -0.4807],
+ [-0.4099, -0.3795, -0.2929],
+ [-0.1950, -0.1207, -0.2626]])
+ """
+
+ def __init__(self, weight=1., alpha=0.25, gamma=2, eps=1e-12):
+ self.weight = weight
+ self.alpha = alpha
+ self.gamma = gamma
+ self.eps = eps
+
+ def __call__(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits, shape
+ [num_query, num_class].
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+ Returns:
+ torch.Tensor: cls_cost value with weight
+ """
+ cls_pred = cls_pred.sigmoid()
+ neg_cost = -(1 - cls_pred + self.eps).log() * (
+ 1 - self.alpha) * cls_pred.pow(self.gamma)
+ pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+ 1 - cls_pred).pow(self.gamma)
+ cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
+ return cls_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class ClassificationCost:
+ """ClsSoftmaxCost.
+
+ Args:
+ weight (int | float, optional): loss_weight
+
+ Examples:
+ >>> from mmcv.core.bbox.match_costs.match_cost import \
+ ... ClassificationCost
+ >>> import torch
+ >>> self = ClassificationCost()
+ >>> cls_pred = torch.rand(4, 3)
+ >>> gt_labels = torch.tensor([0, 1, 2])
+ >>> factor = torch.tensor([10, 8, 10, 8])
+ >>> self(cls_pred, gt_labels)
+ tensor([[-0.3430, -0.3525, -0.3045],
+ [-0.3077, -0.2931, -0.3992],
+ [-0.3664, -0.3455, -0.2881],
+ [-0.3343, -0.2701, -0.3956]])
+ """
+
+ def __init__(self, weight=1.):
+ self.weight = weight
+
+ def __call__(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits, shape
+ [num_query, num_class].
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+ Returns:
+ torch.Tensor: cls_cost value with weight
+ """
+ # Following the official DETR repo, in contrast to the loss (where
+ # NLL is used), we approximate it by 1 - cls_score[gt_label].
+ # The 1 is a constant that doesn't change the matching,
+ # so it can be omitted.
+ cls_score = cls_pred.softmax(-1)
+ cls_cost = -cls_score[:, gt_labels]
+ return cls_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class IoUCost:
+ """IoUCost.
+
+ Args:
+ iou_mode (str, optional): iou mode such as 'iou' | 'giou'
+ weight (int | float, optional): loss weight
+
+ Examples:
+ >>> from mmcv.core.bbox.match_costs.match_cost import IoUCost
+ >>> import torch
+ >>> self = IoUCost()
+ >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
+ >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+ >>> self(bboxes, gt_bboxes)
+ tensor([[-0.1250, 0.1667],
+ [ 0.1667, -0.5000]])
+ """
+
+ def __init__(self, iou_mode='giou', weight=1.):
+ self.weight = weight
+ self.iou_mode = iou_mode
+
+ def __call__(self, bboxes, gt_bboxes):
+ """
+ Args:
+ bboxes (Tensor): Predicted boxes with unnormalized coordinates
+ (x1, y1, x2, y2). Shape [num_query, 4].
+ gt_bboxes (Tensor): Ground truth boxes with unnormalized
+ coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+
+ Returns:
+ torch.Tensor: iou_cost value with weight
+ """
+ # overlaps: [num_bboxes, num_gt]
+ overlaps = bbox_overlaps(
+ bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
+ # The 1 is a constant that doesn't change the matching, so omitted.
+ iou_cost = -overlaps
+ return iou_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class BBox3DL1Cost(object):
+ """BBox3DL1Cost.
+ Args:
+ weight (int | float, optional): loss_weight
+ """
+
+ def __init__(self, weight=1.):
+ self.weight = weight
+
+ def __call__(self, bbox_pred, gt_bboxes):
+ """
+ Args:
+ bbox_pred (Tensor): Predicted 3D boxes with normalized coordinates
+ (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy).
+ Shape [num_query, code_size].
+ gt_bboxes (Tensor): Normalized ground truth 3D boxes in the same
+ format. Shape [num_gt, code_size].
+ Returns:
+ torch.Tensor: bbox_cost value with weight
+ """
+ bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+ return bbox_cost * self.weight
+
+#@weighted_loss
+def smooth_l1_loss(pred, target, beta=1.0):
+ """Smooth L1 loss.
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ beta (float, optional): The threshold in the piecewise function.
+ Defaults to 1.0.
+ Returns:
+ torch.Tensor: Calculated loss
+ """
+ assert beta > 0
+ if target.numel() == 0:
+ return pred.sum() * 0
+
+ # assert pred.size() == target.size()
+ diff = torch.abs(pred - target)
+ loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
+ diff - 0.5 * beta)
+ return loss.sum(-1)
+
+
+@MATCH_COST.register_module()
+class SmoothL1Cost(object):
+ """SmoothL1Cost.
+ Args:
+ weight (int | float, optional): loss weight
+
+ Examples:
+ >>> from mmcv.core.bbox.match_costs.match_cost import SmoothL1Cost
+ >>> import torch
+ >>> self = SmoothL1Cost()
+ >>> preds = torch.rand(2, 4)
+ >>> targets = torch.rand(3, 4)
+ >>> self(preds, targets).shape
+ torch.Size([2, 3])
+ """
+
+ def __init__(self, weight=1.):
+ self.weight = weight
+
+ def __call__(self, input, target):
+ """
+ Args:
+ input (Tensor): Predictions with shape [num_query, C].
+ target (Tensor): Targets with shape [num_gt, C].
+
+ Returns:
+ torch.Tensor: Smooth L1 cost matrix with weight, shape
+ [num_query, num_gt].
+ """
+ N1, C = input.shape
+ N2, C = target.shape
+ input = input.contiguous().view(N1, C)[:, None, :]
+ target = target.contiguous().view(N2, C)[None, :, :]
+ cost = smooth_l1_loss(input, target)
+
+ return cost * self.weight
+
+
+@MATCH_COST.register_module()
+class DiceCost(object):
+ """IoUCost.
+
+ Args:
+ iou_mode (str, optional): iou mode such as 'iou' | 'giou'
+ weight (int | float, optional): loss weight
+
+ Examples:
+ >>> from mmcv.core.bbox.match_costs.match_cost import IoUCost
+ >>> import torch
+ >>> self = IoUCost()
+ >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
+ >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+ >>> self(bboxes, gt_bboxes)
+ tensor([[-0.1250, 0.1667],
+ [ 0.1667, -0.5000]])
+ """
+
+ def __init__(self, weight=1.):
+ self.weight = weight
+ self.count = 0
+
+ def __call__(self, input, target):
+ """
+ Args:
+ input (Tensor): Predicted masks with shape [num_query, H, W].
+ target (Tensor): Ground truth masks with shape [num_gt, H', W'];
+ resized to (H, W) when the spatial sizes differ.
+
+ Returns:
+ torch.Tensor: Dice cost matrix with weight, shape
+ [num_query, num_gt].
+ """
+ # overlaps: [num_bboxes, num_gt]
+ # print('INPUT', input.shape)
+ # print('target',target.shape)
+
+ N1, H1, W1 = input.shape
+ N2, H2, W2 = target.shape
+
+ if H1 != H2 or W1 != W2:
+ target = F.interpolate(target.unsqueeze(0), size=(H1, W1), mode='bilinear').squeeze(0)
+
+ input = input.contiguous().view(N1, -1)[:, None, :]
+ target = target.contiguous().view(N2, -1)[None, :, :]
+
+ a = torch.sum(input * target, -1)
+ b = torch.sum(input * input, -1) + 0.001
+ c = torch.sum(target * target, -1) + 0.001
+ d = (2 * a) / (b + c)
+ return (1 - d) * self.weight
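+
+
+# NOTE: illustrative sketch of combining match costs into the cost matrix used
+# for Hungarian assignment (not part of the original file); shapes and weights
+# below are made up.
+if __name__ == '__main__':
+    num_query, num_gt, num_classes, code_size = 6, 3, 10, 10
+    cls_cost = FocalLossCost(weight=2.0)
+    reg_cost = BBox3DL1Cost(weight=0.25)
+
+    cls_pred = torch.randn(num_query, num_classes)
+    bbox_pred = torch.randn(num_query, code_size)
+    gt_labels = torch.randint(0, num_classes, (num_gt,))
+    normalized_gt_bboxes = torch.randn(num_gt, code_size)
+
+    cost = cls_cost(cls_pred, gt_labels) + reg_cost(bbox_pred, normalized_gt_bboxes)
+    print(cost.shape)  # (num_query, num_gt), fed to the Hungarian matcher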
diff --git a/mmcv/core/bbox/samplers/__init__.py b/mmcv/core/bbox/samplers/__init__.py
new file mode 100644
index 0000000..3a743dc
--- /dev/null
+++ b/mmcv/core/bbox/samplers/__init__.py
@@ -0,0 +1,6 @@
+from .pseudo_sampler import PseudoSampler
+
+
+__all__ = [
+ 'PseudoSampler'
+]
diff --git a/mmcv/core/bbox/samplers/base_sampler.py b/mmcv/core/bbox/samplers/base_sampler.py
new file mode 100644
index 0000000..1534082
--- /dev/null
+++ b/mmcv/core/bbox/samplers/base_sampler.py
@@ -0,0 +1,101 @@
+from abc import ABCMeta, abstractmethod
+
+import torch
+
+from .sampling_result import SamplingResult
+
+
+class BaseSampler(metaclass=ABCMeta):
+ """Base class of samplers."""
+
+ def __init__(self,
+ num,
+ pos_fraction,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True,
+ **kwargs):
+ self.num = num
+ self.pos_fraction = pos_fraction
+ self.neg_pos_ub = neg_pos_ub
+ self.add_gt_as_proposals = add_gt_as_proposals
+ self.pos_sampler = self
+ self.neg_sampler = self
+
+ @abstractmethod
+ def _sample_pos(self, assign_result, num_expected, **kwargs):
+ """Sample positive samples."""
+ pass
+
+ @abstractmethod
+ def _sample_neg(self, assign_result, num_expected, **kwargs):
+ """Sample negative samples."""
+ pass
+
+ def sample(self,
+ assign_result,
+ bboxes,
+ gt_bboxes,
+ gt_labels=None,
+ **kwargs):
+ """Sample positive and negative bboxes.
+
+ This is a simple implementation of bbox sampling given candidates,
+ assigning results and ground truth bboxes.
+
+ Args:
+ assign_result (:obj:`AssignResult`): Bbox assigning results.
+ bboxes (Tensor): Boxes to be sampled from.
+ gt_bboxes (Tensor): Ground truth bboxes.
+ gt_labels (Tensor, optional): Class labels of ground truth bboxes.
+
+ Returns:
+ :obj:`SamplingResult`: Sampling result.
+
+ Example:
+ >>> from mmcv.core.bbox import RandomSampler
+ >>> from mmcv.core.bbox import AssignResult
+ >>> from mmcv.core.bbox.demodata import ensure_rng, random_boxes
+ >>> rng = ensure_rng(None)
+ >>> assign_result = AssignResult.random(rng=rng)
+ >>> bboxes = random_boxes(assign_result.num_preds, rng=rng)
+ >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng)
+ >>> gt_labels = None
+ >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1,
+ >>> add_gt_as_proposals=False)
+ >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+ """
+ if len(bboxes.shape) < 2:
+ bboxes = bboxes[None, :]
+
+ bboxes = bboxes[:, :4]
+
+ gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
+ if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+ if gt_labels is None:
+ raise ValueError(
+ 'gt_labels must be given when add_gt_as_proposals is True')
+ bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
+ assign_result.add_gt_(gt_labels)
+ gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
+ gt_flags = torch.cat([gt_ones, gt_flags])
+
+ num_expected_pos = int(self.num * self.pos_fraction)
+ pos_inds = self.pos_sampler._sample_pos(
+ assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
+ # We found that sampled indices have duplicated items occasionally.
+ # (may be a bug of PyTorch)
+ pos_inds = pos_inds.unique()
+ num_sampled_pos = pos_inds.numel()
+ num_expected_neg = self.num - num_sampled_pos
+ if self.neg_pos_ub >= 0:
+ _pos = max(1, num_sampled_pos)
+ neg_upper_bound = int(self.neg_pos_ub * _pos)
+ if num_expected_neg > neg_upper_bound:
+ num_expected_neg = neg_upper_bound
+ neg_inds = self.neg_sampler._sample_neg(
+ assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
+ neg_inds = neg_inds.unique()
+
+ sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+ assign_result, gt_flags)
+ return sampling_result
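+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the
+ # positive/negative budget arithmetic used in BaseSampler.sample() above,
+ # with assumed hyper-parameters num=256, pos_fraction=0.25, neg_pos_ub=3.
+ num, pos_fraction, neg_pos_ub = 256, 0.25, 3
+ num_expected_pos = int(num * pos_fraction)        # 64 positive slots
+ num_sampled_pos = 10                              # pretend only 10 matched
+ num_expected_neg = num - num_sampled_pos          # 246 before the cap
+ if neg_pos_ub >= 0:
+ num_expected_neg = min(num_expected_neg,
+ int(neg_pos_ub * max(1, num_sampled_pos)))
+ print(num_expected_pos, num_expected_neg)         # -> 64 30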
diff --git a/mmcv/core/bbox/samplers/pseudo_sampler.py b/mmcv/core/bbox/samplers/pseudo_sampler.py
new file mode 100644
index 0000000..2bd81ab
--- /dev/null
+++ b/mmcv/core/bbox/samplers/pseudo_sampler.py
@@ -0,0 +1,41 @@
+import torch
+
+from ..builder import BBOX_SAMPLERS
+from .base_sampler import BaseSampler
+from .sampling_result import SamplingResult
+
+
+@BBOX_SAMPLERS.register_module()
+class PseudoSampler(BaseSampler):
+ """A pseudo sampler that does not do sampling actually."""
+
+ def __init__(self, **kwargs):
+ pass
+
+ def _sample_pos(self, **kwargs):
+ """Sample positive samples."""
+ raise NotImplementedError
+
+ def _sample_neg(self, **kwargs):
+ """Sample negative samples."""
+ raise NotImplementedError
+
+ def sample(self, assign_result, bboxes, gt_bboxes, **kwargs):
+ """Directly returns the positive and negative indices of samples.
+
+ Args:
+ assign_result (:obj:`AssignResult`): Assigned results
+ bboxes (torch.Tensor): Bounding boxes
+ gt_bboxes (torch.Tensor): Ground truth boxes
+
+ Returns:
+ :obj:`SamplingResult`: sampler results
+ """
+ pos_inds = torch.nonzero(
+ assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+ neg_inds = torch.nonzero(
+ assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+ gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8)
+ sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+ assign_result, gt_flags)
+ return sampling_result
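+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the index
+ # split performed by PseudoSampler.sample(). gt_inds follows the
+ # AssignResult convention used above: 0 means unmatched (negative), and a
+ # value > 0 is a 1-based index into the ground-truth boxes.
+ import torch
+ gt_inds = torch.tensor([0, 2, 0, 1])
+ pos = torch.nonzero(gt_inds > 0, as_tuple=False).squeeze(-1).unique()
+ neg = torch.nonzero(gt_inds == 0, as_tuple=False).squeeze(-1).unique()
+ print(pos.tolist(), neg.tolist())   # [1, 3] [0, 2]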
diff --git a/mmcv/core/bbox/samplers/sampling_result.py b/mmcv/core/bbox/samplers/sampling_result.py
new file mode 100644
index 0000000..06eff7e
--- /dev/null
+++ b/mmcv/core/bbox/samplers/sampling_result.py
@@ -0,0 +1,152 @@
+import torch
+
+from mmcv.utils import util_mixins
+
+
+class SamplingResult(util_mixins.NiceRepr):
+ """Bbox sampling result.
+
+ Example:
+ >>> # xdoctest: +IGNORE_WANT
+ >>> from mmcv.core.bbox.samplers.sampling_result import * # NOQA
+ >>> self = SamplingResult.random(rng=10)
+ >>> print(f'self = {self}')
+ self =
+ """
+
+ def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
+ gt_flags):
+ self.pos_inds = pos_inds
+ self.neg_inds = neg_inds
+ self.pos_bboxes = bboxes[pos_inds]
+ self.neg_bboxes = bboxes[neg_inds]
+ self.pos_is_gt = gt_flags[pos_inds]
+
+ self.num_gts = gt_bboxes.shape[0]
+ self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+ if gt_bboxes.numel() == 0:
+ # hack for index error case
+ assert self.pos_assigned_gt_inds.numel() == 0
+ self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
+ else:
+ if len(gt_bboxes.shape) < 2:
+ gt_bboxes = gt_bboxes.view(-1, 4)
+
+ self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :]
+
+ if assign_result.labels is not None:
+ self.pos_gt_labels = assign_result.labels[pos_inds]
+ else:
+ self.pos_gt_labels = None
+
+ @property
+ def bboxes(self):
+ """torch.Tensor: concatenated positive and negative boxes"""
+ return torch.cat([self.pos_bboxes, self.neg_bboxes])
+
+ def to(self, device):
+ """Change the device of the data inplace.
+
+ Example:
+ >>> self = SamplingResult.random()
+ >>> print(f'self = {self.to(None)}')
+ >>> # xdoctest: +REQUIRES(--gpu)
+ >>> print(f'self = {self.to(0)}')
+ """
+ _dict = self.__dict__
+ for key, value in _dict.items():
+ if isinstance(value, torch.Tensor):
+ _dict[key] = value.to(device)
+ return self
+
+ def __nice__(self):
+ data = self.info.copy()
+ data['pos_bboxes'] = data.pop('pos_bboxes').shape
+ data['neg_bboxes'] = data.pop('neg_bboxes').shape
+ parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+ body = ' ' + ',\n '.join(parts)
+ return '{\n' + body + '\n}'
+
+ @property
+ def info(self):
+ """Returns a dictionary of info about the object."""
+ return {
+ 'pos_inds': self.pos_inds,
+ 'neg_inds': self.neg_inds,
+ 'pos_bboxes': self.pos_bboxes,
+ 'neg_bboxes': self.neg_bboxes,
+ 'pos_is_gt': self.pos_is_gt,
+ 'num_gts': self.num_gts,
+ 'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+ }
+
+ @classmethod
+ def random(cls, rng=None, **kwargs):
+ """
+ Args:
+ rng (None | int | numpy.random.RandomState): seed or state.
+ kwargs (keyword arguments):
+ - num_preds: number of predicted boxes
+ - num_gts: number of true boxes
+ - p_ignore (float): probability of a predicted box assigned to \
+ an ignored truth.
+ - p_assigned (float): probability of a predicted box not being \
+ assigned.
+ - p_use_label (float | bool): with labels or not.
+
+ Returns:
+ :obj:`SamplingResult`: Randomly generated sampling result.
+
+ Example:
+ >>> from mmcv.core.bbox.samplers.sampling_result import * # NOQA
+ >>> self = SamplingResult.random()
+ >>> print(self.__dict__)
+ """
+ from mmcv.core.bbox.samplers.random_sampler import RandomSampler
+ from mmcv.core.bbox.assigners.assign_result import AssignResult
+ from mmcv.core.bbox import demodata
+ rng = demodata.ensure_rng(rng)
+
+ # make probabilistic?
+ num = 32
+ pos_fraction = 0.5
+ neg_pos_ub = -1
+
+ assign_result = AssignResult.random(rng=rng, **kwargs)
+
+ # Note we could just compute an assignment
+ bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng)
+ gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng)
+
+ if rng.rand() > 0.2:
+ # sometimes algorithms squeeze their data, be robust to that
+ gt_bboxes = gt_bboxes.squeeze()
+ bboxes = bboxes.squeeze()
+
+ if assign_result.labels is None:
+ gt_labels = None
+ else:
+ gt_labels = None # todo
+
+ if gt_labels is None:
+ add_gt_as_proposals = False
+ else:
+ add_gt_as_proposals = True # make probabilistic?
+
+ sampler = RandomSampler(
+ num,
+ pos_fraction,
+ neg_pos_ub=neg_pos_ub,
+ add_gt_as_proposals=add_gt_as_proposals,
+ rng=rng)
+ self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+ return self
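+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: what a
+ # SamplingResult stores for a toy assignment. A SimpleNamespace stands in
+ # for a real AssignResult; only the attributes read by __init__ above
+ # (gt_inds and labels) are provided.
+ from types import SimpleNamespace
+ assign = SimpleNamespace(gt_inds=torch.tensor([0, 2, 1, 0]), labels=None)
+ bboxes = torch.arange(16, dtype=torch.float32).view(4, 4)
+ gt_bboxes = torch.zeros(2, 4)
+ res = SamplingResult(torch.tensor([1, 2]), torch.tensor([0, 3]), bboxes,
+ gt_bboxes, assign, torch.zeros(4, dtype=torch.uint8))
+ print(res.pos_assigned_gt_inds.tolist())   # [1, 0]: shifted to 0-based ids
+ print(res.bboxes.shape)                    # (4, 4): pos + neg concatenated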
diff --git a/mmcv/core/bbox/structures/__init__.py b/mmcv/core/bbox/structures/__init__.py
new file mode 100644
index 0000000..7e55840
--- /dev/null
+++ b/mmcv/core/bbox/structures/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period,
+ mono_cam_box2vis, points_cam2img, rotation_3d_in_axis,
+ xywhr2xyxyr)
diff --git a/mmcv/core/bbox/structures/base_box3d.py b/mmcv/core/bbox/structures/base_box3d.py
new file mode 100644
index 0000000..a09caf4
--- /dev/null
+++ b/mmcv/core/bbox/structures/base_box3d.py
@@ -0,0 +1,462 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from abc import abstractmethod
+
+# from mmcv.ops.iou3d import iou3d_cuda
+from .utils import limit_period, xywhr2xyxyr
+from mmcv.ops.iou3d_det import iou3d_cuda
+
+
+class BaseInstance3DBoxes(object):
+ """Base class for 3D Boxes.
+
+ Note:
+ The box is bottom centered, i.e. the relative position of origin in
+ the box is (0.5, 0.5, 0).
+
+ Args:
+ tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix.
+ box_dim (int): Number of the dimension of a box.
+ Each row is (x, y, z, x_size, y_size, z_size, yaw).
+ Default to 7.
+ with_yaw (bool): Whether the box is with yaw rotation.
+ If False, the value of yaw will be set to 0 as minmax boxes.
+ Default to True.
+ origin (tuple[float]): The relative position of origin in the box.
+ Default to (0.5, 0.5, 0). This will guide the box be converted to
+ (0.5, 0.5, 0) mode.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x box_dim.
+ box_dim (int): Integer indicating the dimension of a box.
+ Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
+ with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
+ boxes.
+ """
+
+ def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)):
+ if isinstance(tensor, torch.Tensor):
+ device = tensor.device
+ else:
+ device = torch.device('cpu')
+ tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+ if tensor.numel() == 0:
+ # Use reshape, so we don't end up creating a new tensor that
+ # does not depend on the inputs (and consequently confuses jit)
+ tensor = tensor.reshape((0, box_dim)).to(
+ dtype=torch.float32, device=device)
+ assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()
+
+ if tensor.shape[-1] == 6:
+ # If the dimension of boxes is 6, we expand box_dim by padding
+ # 0 as a fake yaw and set with_yaw to False.
+ assert box_dim == 6
+ fake_rot = tensor.new_zeros(tensor.shape[0], 1)
+ tensor = torch.cat((tensor, fake_rot), dim=-1)
+ self.box_dim = box_dim + 1
+ self.with_yaw = False
+ else:
+ self.box_dim = box_dim
+ self.with_yaw = with_yaw
+ self.tensor = tensor.clone()
+
+ if origin != (0.5, 0.5, 0):
+ dst = self.tensor.new_tensor((0.5, 0.5, 0))
+ src = self.tensor.new_tensor(origin)
+ self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
+
+ @property
+ def volume(self):
+ """torch.Tensor: A vector with volume of each box."""
+ return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5]
+
+ @property
+ def dims(self):
+ """torch.Tensor: Corners of each box with size (N, 8, 3)."""
+ return self.tensor[:, 3:6]
+
+ @property
+ def yaw(self):
+ """torch.Tensor: A vector with yaw of each box."""
+ return self.tensor[:, 6]
+
+ @property
+ def height(self):
+ """torch.Tensor: A vector with height of each box."""
+ return self.tensor[:, 5]
+
+ @property
+ def top_height(self):
+ """torch.Tensor: A vector with the top height of each box."""
+ return self.bottom_height + self.height
+
+ @property
+ def bottom_height(self):
+ """torch.Tensor: A vector with bottom's height of each box."""
+ return self.tensor[:, 2]
+
+ @property
+ def center(self):
+ """Calculate the center of all the boxes.
+
+ Note:
+ In the MMDetection3D's convention, the bottom center is
+ usually taken as the default center.
+
+ The relative position of the centers in different kinds of
+ boxes is different, e.g., the relative center of a box is
+ (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
+ It is recommended to use ``bottom_center`` or ``gravity_center``
+ for more clear usage.
+
+ Returns:
+ torch.Tensor: A tensor with center of each box.
+ """
+ return self.bottom_center
+
+ @property
+ def bottom_center(self):
+ """torch.Tensor: A tensor with center of each box."""
+ return self.tensor[:, :3]
+
+ @property
+ def gravity_center(self):
+ """torch.Tensor: A tensor with center of each box."""
+ pass
+
+ @property
+ def corners(self):
+ """torch.Tensor: a tensor with 8 corners of each box."""
+ pass
+
+ @abstractmethod
+ def rotate(self, angle, points=None):
+ """Rotate boxes with points (optional) with the given angle or \
+ rotation matrix.
+
+ Args:
+ angle (float | torch.Tensor | np.ndarray):
+ Rotation angle or rotation matrix.
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
+ Points to rotate. Defaults to None.
+ """
+ pass
+
+ @abstractmethod
+ def flip(self, bev_direction='horizontal'):
+ """Flip the boxes in BEV along given BEV direction."""
+ pass
+
+ def translate(self, trans_vector):
+ """Translate boxes with the given translation vector.
+
+ Args:
+ trans_vector (torch.Tensor): Translation vector of size 1x3.
+ """
+ if not isinstance(trans_vector, torch.Tensor):
+ trans_vector = self.tensor.new_tensor(trans_vector)
+ self.tensor[:, :3] += trans_vector
+
+ def in_range_3d(self, box_range):
+ """Check whether the boxes are in the given range.
+
+ Args:
+ box_range (list | torch.Tensor): The range of box
+ (x_min, y_min, z_min, x_max, y_max, z_max)
+
+ Note:
+ In the original implementation of SECOND, checking whether
+ a box is in the range is done by checking whether its points lie
+ in a convex polygon; here we reduce the burden for simpler cases.
+
+ Returns:
+ torch.Tensor: A binary vector indicating whether each box is \
+ inside the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > box_range[0])
+ & (self.tensor[:, 1] > box_range[1])
+ & (self.tensor[:, 2] > box_range[2])
+ & (self.tensor[:, 0] < box_range[3])
+ & (self.tensor[:, 1] < box_range[4])
+ & (self.tensor[:, 2] < box_range[5]))
+ return in_range_flags
+
+ @abstractmethod
+ def in_range_bev(self, box_range):
+ """Check whether the boxes are in the given range.
+
+ Args:
+ box_range (list | torch.Tensor): The range of box
+ in order of (x_min, y_min, x_max, y_max).
+
+ Returns:
+ torch.Tensor: Indicating whether each box is inside \
+ the reference range.
+ """
+ pass
+
+ @abstractmethod
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`Box3DMode`): The target Box mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: The converted box of the same type \
+ in the `dst` mode.
+ """
+ pass
+
+ def scale(self, scale_factor):
+ """Scale the box with horizontal and vertical scaling factors.
+
+ Args:
+ scale_factor (float): Scale factor to scale the boxes.
+ """
+ self.tensor[:, :6] *= scale_factor
+ self.tensor[:, 7:] *= scale_factor
+
+ def limit_yaw(self, offset=0.5, period=np.pi):
+ """Limit the yaw to a given period and offset.
+
+ Args:
+ offset (float): The offset of the yaw.
+ period (float): The expected period.
+ """
+ self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period)
+
+ def nonempty(self, threshold: float = 0.0):
+ """Find boxes that are non-empty.
+
+ A box is considered empty if any of its sides is no larger than
+ the threshold.
+
+ Args:
+ threshold (float): The threshold of minimal sizes.
+
+ Returns:
+ torch.Tensor: A binary vector which represents whether each \
+ box is empty (False) or non-empty (True).
+ """
+ box = self.tensor
+ size_x = box[..., 3]
+ size_y = box[..., 4]
+ size_z = box[..., 5]
+ keep = ((size_x > threshold)
+ & (size_y > threshold) & (size_z > threshold))
+ return keep
+
+ def __getitem__(self, item):
+ """
+ Note:
+ The following usages are allowed:
+ 1. `new_boxes = boxes[3]`:
+ return a `Boxes` that contains only one box.
+ 2. `new_boxes = boxes[2:10]`:
+ return a slice of boxes.
+ 3. `new_boxes = boxes[vector]`:
+ where vector is a torch.BoolTensor with `length = len(boxes)`.
+ Nonzero elements in the vector will be selected.
+ Note that the returned Boxes might share storage with this Boxes,
+ subject to Pytorch's indexing semantics.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: A new object of \
+ :class:`BaseInstances3DBoxes` after indexing.
+ """
+ original_type = type(self)
+ if isinstance(item, int):
+ return original_type(
+ self.tensor[item].view(1, -1),
+ box_dim=self.box_dim,
+ with_yaw=self.with_yaw)
+ b = self.tensor[item]
+ assert b.dim() == 2, \
+ f'Indexing on Boxes with {item} failed to return a matrix!'
+ return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+ def __len__(self):
+ """int: Number of boxes in the current object."""
+ return self.tensor.shape[0]
+
+ def __repr__(self):
+ """str: Return a strings that describes the object."""
+ return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')'
+
+ @classmethod
+ def cat(cls, boxes_list):
+ """Concatenate a list of Boxes into a single Boxes.
+
+ Args:
+ boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: The concatenated Boxes.
+ """
+ assert isinstance(boxes_list, (list, tuple))
+ if len(boxes_list) == 0:
+ return cls(torch.empty(0))
+ assert all(isinstance(box, cls) for box in boxes_list)
+
+ # use torch.cat (v.s. layers.cat)
+ # so the returned boxes never share storage with input
+ cat_boxes = cls(
+ torch.cat([b.tensor for b in boxes_list], dim=0),
+ box_dim=boxes_list[0].tensor.shape[1],
+ with_yaw=boxes_list[0].with_yaw)
+ return cat_boxes
+
+ def to(self, device):
+ """Convert current boxes to a specific device.
+
+ Args:
+ device (str | :obj:`torch.device`): The name of the device.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: A new boxes object on the \
+ specific device.
+ """
+ original_type = type(self)
+ return original_type(
+ self.tensor.to(device),
+ box_dim=self.box_dim,
+ with_yaw=self.with_yaw)
+
+ def clone(self):
+ """Clone the Boxes.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: Box object with the same properties \
+ as self.
+ """
+ original_type = type(self)
+ return original_type(
+ self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw)
+
+ @property
+ def device(self):
+ """str: The device of the boxes are on."""
+ return self.tensor.device
+
+ def __iter__(self):
+ """Yield a box as a Tensor of shape (4,) at a time.
+
+ Returns:
+ torch.Tensor: A box of shape (4,).
+ """
+ yield from self.tensor
+
+ @classmethod
+ def height_overlaps(cls, boxes1, boxes2, mode='iou'):
+ """Calculate height overlaps of two boxes.
+
+ Note:
+ This function calculates the height overlaps between boxes1 and
+ boxes2, boxes1 and boxes2 should be in the same type.
+
+ Args:
+ boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
+ boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.
+ mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
+
+ Returns:
+ torch.Tensor: Calculated height overlap of the boxes.
+ """
+ assert isinstance(boxes1, BaseInstance3DBoxes)
+ assert isinstance(boxes2, BaseInstance3DBoxes)
+ assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should ' \
+ f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'
+
+ boxes1_top_height = boxes1.top_height.view(-1, 1)
+ boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
+ boxes2_top_height = boxes2.top_height.view(1, -1)
+ boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
+
+ highest_of_bottom = torch.max(boxes1_bottom_height,
+ boxes2_bottom_height)
+ lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
+ overlaps_h = torch.clamp(lowest_of_top - highest_of_bottom, min=0)
+ return overlaps_h
+
+ @classmethod
+ def overlaps(cls, boxes1, boxes2, mode='iou'):
+ """Calculate 3D overlaps of two boxes.
+
+ Note:
+ This function calculates the overlaps between ``boxes1`` and
+ ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.
+
+ Args:
+ boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes.
+ boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes.
+ mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
+
+ Returns:
+ torch.Tensor: Calculated 3D IoU (mode='iou') or IoF (mode='iof') of
+ the boxes.
+ """
+ assert isinstance(boxes1, BaseInstance3DBoxes)
+ assert isinstance(boxes2, BaseInstance3DBoxes)
+ assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should ' \
+ f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'
+
+ assert mode in ['iou', 'iof']
+
+ rows = len(boxes1)
+ cols = len(boxes2)
+ if rows * cols == 0:
+ return boxes1.tensor.new(rows, cols)
+
+ # height overlap
+ overlaps_h = cls.height_overlaps(boxes1, boxes2)
+
+ # obtain BEV boxes in XYXYR format
+ boxes1_bev = xywhr2xyxyr(boxes1.bev)
+ boxes2_bev = xywhr2xyxyr(boxes2.bev)
+
+ # bev overlap
+ overlaps_bev = boxes1_bev.new_zeros(
+ (boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda() # (N, M)
+ iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(),
+ boxes2_bev.contiguous().cuda(),
+ overlaps_bev)
+
+ # 3d overlaps
+ overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h
+
+ volume1 = boxes1.volume.view(-1, 1)
+ volume2 = boxes2.volume.view(1, -1)
+
+ if mode == 'iou':
+ # the clamp func is used to avoid division of 0
+ iou3d = overlaps_3d / torch.clamp(
+ volume1 + volume2 - overlaps_3d, min=1e-8)
+ else:
+ iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8)
+
+ return iou3d
+
+ def new_box(self, data):
+ """Create a new box object with data.
+
+ The new box and its tensor have the same properties \
+ as ``self`` and ``self.tensor``, respectively.
+
+ Args:
+ data (torch.Tensor | numpy.array | list): Data to be copied.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, \
+ the object's other properties are similar to ``self``.
+ """
+ new_tensor = self.tensor.new_tensor(data) \
+ if not isinstance(data, torch.Tensor) else data.to(self.device)
+ original_type = type(self)
+ return original_type(
+ new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw)
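+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the origin
+ # shift applied in __init__ above. A gravity-centred input box (relative
+ # origin (0.5, 0.5, 0.5)) with centre z = 1.0 and height 2.0 becomes a
+ # bottom-centred box with z = 0.0.
+ import torch
+ box = torch.tensor([[0.0, 0.0, 1.0, 4.0, 2.0, 2.0, 0.0]])
+ dst = box.new_tensor((0.5, 0.5, 0.0))   # target relative origin
+ src = box.new_tensor((0.5, 0.5, 0.5))   # relative origin of the input
+ box[:, :3] += box[:, 3:6] * (dst - src)
+ print(box[0, :3].tolist())              # [0.0, 0.0, 0.0]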
diff --git a/mmcv/core/bbox/structures/box_3d_mode.py b/mmcv/core/bbox/structures/box_3d_mode.py
new file mode 100644
index 0000000..6e2db4f
--- /dev/null
+++ b/mmcv/core/bbox/structures/box_3d_mode.py
@@ -0,0 +1,166 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from enum import IntEnum, unique
+
+from .base_box3d import BaseInstance3DBoxes
+from .cam_box3d import CameraInstance3DBoxes
+from .depth_box3d import DepthInstance3DBoxes
+from .lidar_box3d import LiDARInstance3DBoxes
+
+
+@unique
+class Box3DMode(IntEnum):
+ r"""Enum of different ways to represent a box.
+
+ Coordinates in LiDAR:
+
+ .. code-block:: none
+
+ up z
+ ^ x front
+ | /
+ | /
+ left y <------ 0
+
+ The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
+ and the yaw is around the z axis, thus the rotation axis=2.
+
+ Coordinates in camera:
+
+ .. code-block:: none
+
+ z front
+ /
+ /
+ 0 ------> x right
+ |
+ |
+ v
+ down y
+
+ The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
+ and the yaw is around the y axis, thus the rotation axis=1.
+
+ Coordinates in Depth mode:
+
+ .. code-block:: none
+
+ up z
+ ^ y front
+ | /
+ | /
+ 0 ------> x right
+
+ The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
+ and the yaw is around the z axis, thus the rotation axis=2.
+ """
+
+ LIDAR = 0
+ CAM = 1
+ DEPTH = 2
+
+ @staticmethod
+ def convert(box, src, dst, rt_mat=None):
+ """Convert boxes from `src` mode to `dst` mode.
+
+ Args:
+ box (tuple | list | np.ndarray |
+ torch.Tensor | BaseInstance3DBoxes):
+ Can be a k-tuple, k-list or an Nxk array/tensor, where k >= 7.
+ src (:obj:`Box3DMode`): The src Box mode.
+ dst (:obj:`Box3DMode`): The target Box mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \
+ The converted box of the same type.
+ """
+ if src == dst:
+ return box
+
+ is_numpy = isinstance(box, np.ndarray)
+ is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
+ single_box = isinstance(box, (list, tuple))
+ if single_box:
+ assert len(box) >= 7, (
+ 'Box3DMode.convert takes either a k-tuple/list or '
+ 'an Nxk array/tensor, where k >= 7')
+ arr = torch.tensor(box)[None, :]
+ else:
+ # avoid modifying the input box
+ if is_numpy:
+ arr = torch.from_numpy(np.asarray(box)).clone()
+ elif is_Instance3DBoxes:
+ arr = box.tensor.clone()
+ else:
+ arr = box.clone()
+
+ # convert box from `src` mode to `dst` mode.
+ x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
+ if src == Box3DMode.LIDAR and dst == Box3DMode.CAM:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+ xyz_size = torch.cat([y_size, z_size, x_size], dim=-1)
+ elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
+ xyz_size = torch.cat([z_size, x_size, y_size], dim=-1)
+ elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+ xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+ elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+ xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+ elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
+ xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
+ elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
+ xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
+ else:
+ raise NotImplementedError(
+ f'Conversion from Box3DMode {src} to {dst} '
+ 'is not supported yet')
+
+ if not isinstance(rt_mat, torch.Tensor):
+ rt_mat = arr.new_tensor(rt_mat)
+ if rt_mat.size(1) == 4:
+ extended_xyz = torch.cat(
+ [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
+ xyz = extended_xyz @ rt_mat.t()
+ else:
+ xyz = arr[:, :3] @ rt_mat.t()
+
+ remains = arr[..., 6:]
+ arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1)
+
+ # convert arr to the original type
+ original_type = type(box)
+ if single_box:
+ return original_type(arr.flatten().tolist())
+ if is_numpy:
+ return arr.numpy()
+ elif is_Instance3DBoxes:
+ if dst == Box3DMode.CAM:
+ target_type = CameraInstance3DBoxes
+ elif dst == Box3DMode.LIDAR:
+ target_type = LiDARInstance3DBoxes
+ elif dst == Box3DMode.DEPTH:
+ target_type = DepthInstance3DBoxes
+ else:
+ raise NotImplementedError(
+ f'Conversion to {dst} through {original_type}'
+ ' is not supported yet')
+ return target_type(
+ arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
+ else:
+ return arr
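+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the
+ # LiDAR -> CAM branch of convert() applied by hand to one box. A LiDAR
+ # point (x, y, z) maps to camera (-y, -z, x), and the sizes are reordered
+ # from (x_size, y_size, z_size) to (y_size, z_size, x_size) to follow the
+ # new axes. The box values are made up.
+ import torch
+ arr = torch.tensor([[10.0, 2.0, -1.0, 4.0, 2.0, 1.5, 0.3]])
+ rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+ xyz = arr[:, :3] @ rt_mat.t()
+ cam = torch.cat([xyz, arr[:, [4, 5, 3]], arr[:, 6:]], dim=-1)
+ print(cam[0].tolist())   # [-2.0, 1.0, 10.0, 2.0, 1.5, 4.0, 0.3]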
diff --git a/mmcv/core/bbox/structures/cam_box3d.py b/mmcv/core/bbox/structures/cam_box3d.py
new file mode 100644
index 0000000..2f0a74b
--- /dev/null
+++ b/mmcv/core/bbox/structures/cam_box3d.py
@@ -0,0 +1,324 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmcv.core.points import BasePoints
+from .base_box3d import BaseInstance3DBoxes
+from .utils import limit_period, rotation_3d_in_axis
+
+
+class CameraInstance3DBoxes(BaseInstance3DBoxes):
+ """3D boxes of instances in CAM coordinates.
+
+ Coordinates in camera:
+
+ .. code-block:: none
+
+ z front (yaw=-0.5*pi)
+ /
+ /
+ 0 ------> x right (yaw=0)
+ |
+ |
+ v
+ down y
+
+ The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
+ and the yaw is around the y axis, thus the rotation axis=1.
+ The yaw is 0 at the positive direction of x axis, and decreases from
+ the positive direction of x to the positive direction of z.
+
+ A refactor is ongoing to make the three coordinate systems
+ easier to understand and convert between each other.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x box_dim.
+ box_dim (int): Integer indicating the dimension of a box.
+ Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
+ with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
+ boxes.
+ """
+
+ def __init__(self,
+ tensor,
+ box_dim=7,
+ with_yaw=True,
+ origin=(0.5, 1.0, 0.5)):
+ if isinstance(tensor, torch.Tensor):
+ device = tensor.device
+ else:
+ device = torch.device('cpu')
+ tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+ if tensor.numel() == 0:
+ # Use reshape, so we don't end up creating a new tensor that
+ # does not depend on the inputs (and consequently confuses jit)
+ tensor = tensor.reshape((0, box_dim)).to(
+ dtype=torch.float32, device=device)
+ assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()
+
+ if tensor.shape[-1] == 6:
+ # If the dimension of boxes is 6, we expand box_dim by padding
+ # 0 as a fake yaw and set with_yaw to False.
+ assert box_dim == 6
+ fake_rot = tensor.new_zeros(tensor.shape[0], 1)
+ tensor = torch.cat((tensor, fake_rot), dim=-1)
+ self.box_dim = box_dim + 1
+ self.with_yaw = False
+ else:
+ self.box_dim = box_dim
+ self.with_yaw = with_yaw
+ self.tensor = tensor.clone()
+
+ if origin != (0.5, 1.0, 0.5):
+ dst = self.tensor.new_tensor((0.5, 1.0, 0.5))
+ src = self.tensor.new_tensor(origin)
+ self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
+
+ @property
+ def height(self):
+ """torch.Tensor: A vector with height of each box."""
+ return self.tensor[:, 4]
+
+ @property
+ def top_height(self):
+ """torch.Tensor: A vector with the top height of each box."""
+ # the positive direction is down rather than up
+ return self.bottom_height - self.height
+
+ @property
+ def bottom_height(self):
+ """torch.Tensor: A vector with bottom's height of each box."""
+ return self.tensor[:, 1]
+
+ @property
+ def gravity_center(self):
+ """torch.Tensor: A tensor with center of each box."""
+ bottom_center = self.bottom_center
+ gravity_center = torch.zeros_like(bottom_center)
+ gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]]
+ gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5
+ return gravity_center
+
+ @property
+ def corners(self):
+ """torch.Tensor: Coordinates of corners of all the boxes in
+ shape (N, 8, 3).
+
+ Convert the boxes to corners in clockwise order, in the form of
+ (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)
+
+ .. code-block:: none
+
+ front z
+ /
+ /
+ (x0, y0, z1) + ----------- + (x1, y0, z1)
+ /| / |
+ / | / |
+ (x0, y0, z0) + ----------- + + (x1, y1, z1)
+ | / . | /
+ | / origin | /
+ (x0, y1, z0) + ----------- + -------> x right
+ | (x1, y1, z0)
+ |
+ v
+ down y
+ """
+ # TODO: rotation_3d_in_axis function do not support
+ # empty tensor currently.
+ assert len(self.tensor) != 0
+ dims = self.dims
+ corners_norm = torch.from_numpy(
+ np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
+ device=dims.device, dtype=dims.dtype)
+
+ corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+ # use relative origin [0.5, 1, 0.5]
+ corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5])
+ corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
+
+ # rotate around y axis
+ corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1)
+ corners += self.tensor[:, :3].view(-1, 1, 3)
+ return corners
+
+ @property
+ def bev(self):
+ """torch.Tensor: A n x 5 tensor of 2D BEV box of each box
+ with rotation in XYWHR format."""
+ return self.tensor[:, [0, 2, 3, 5, 6]]
+
+ @property
+ def nearest_bev(self):
+ """torch.Tensor: A tensor of 2D BEV box of each box
+ without rotation."""
+ # Obtain BEV boxes with rotation in XZWHR format
+ bev_rotated_boxes = self.bev
+ # convert the rotation to a valid range
+ rotations = bev_rotated_boxes[:, -1]
+ normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
+
+ # find the center of boxes
+ conditions = (normed_rotations > np.pi / 4)[..., None]
+ bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
+ [0, 1, 3, 2]],
+ bev_rotated_boxes[:, :4])
+
+ centers = bboxes_xywh[:, :2]
+ dims = bboxes_xywh[:, 2:]
+ bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
+ return bev_boxes
+
+ def rotate(self, angle, points=None):
+ """Rotate boxes with points (optional) with the given angle or \
+ rotation matrix.
+
+ Args:
+ angle (float | torch.Tensor | np.ndarray):
+ Rotation angle or rotation matrix.
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
+ Points to rotate. Defaults to None.
+
+ Returns:
+ tuple or None: When ``points`` is None, the function returns \
+ None, otherwise it returns the rotated points and the \
+ rotation matrix ``rot_mat_T``.
+ """
+ if not isinstance(angle, torch.Tensor):
+ angle = self.tensor.new_tensor(angle)
+ assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \
+ f'invalid rotation angle shape {angle.shape}'
+
+ if angle.numel() == 1:
+ rot_sin = torch.sin(angle)
+ rot_cos = torch.cos(angle)
+ rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin],
+ [0, 1, 0],
+ [rot_sin, 0, rot_cos]])
+ else:
+ rot_mat_T = angle
+ rot_sin = rot_mat_T[2, 0]
+ rot_cos = rot_mat_T[0, 0]
+ angle = np.arctan2(rot_sin, rot_cos)
+
+ self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
+ self.tensor[:, 6] += angle
+
+ if points is not None:
+ if isinstance(points, torch.Tensor):
+ points[:, :3] = points[:, :3] @ rot_mat_T
+ elif isinstance(points, np.ndarray):
+ rot_mat_T = rot_mat_T.numpy()
+ points[:, :3] = np.dot(points[:, :3], rot_mat_T)
+ elif isinstance(points, BasePoints):
+ # clockwise
+ points.rotate(-angle)
+ else:
+ raise ValueError
+ return points, rot_mat_T
+
+ def flip(self, bev_direction='horizontal', points=None):
+ """Flip the boxes in BEV along given BEV direction.
+
+ In CAM coordinates, it flips the x (horizontal) or z (vertical) axis.
+
+ Args:
+ bev_direction (str): Flip direction (horizontal or vertical).
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
+ Points to flip. Defaults to None.
+
+ Returns:
+ torch.Tensor, numpy.ndarray or None: Flipped points.
+ """
+ assert bev_direction in ('horizontal', 'vertical')
+ if bev_direction == 'horizontal':
+ self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+ if self.with_yaw:
+ self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+ elif bev_direction == 'vertical':
+ self.tensor[:, 2::7] = -self.tensor[:, 2::7]
+ if self.with_yaw:
+ self.tensor[:, 6] = -self.tensor[:, 6]
+
+ if points is not None:
+ assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
+ if isinstance(points, (torch.Tensor, np.ndarray)):
+ if bev_direction == 'horizontal':
+ points[:, 0] = -points[:, 0]
+ elif bev_direction == 'vertical':
+ points[:, 2] = -points[:, 2]
+ elif isinstance(points, BasePoints):
+ points.flip(bev_direction)
+ return points
+
+ def in_range_bev(self, box_range):
+ """Check whether the boxes are in the given range.
+
+ Args:
+ box_range (list | torch.Tensor): The range of box
+ (x_min, z_min, x_max, z_max).
+
+ Note:
+ The original implementation of SECOND checks whether a box is in
+ a range by checking whether its points lie in a convex polygon;
+ here we reduce the burden for simpler cases.
+
+ Returns:
+ torch.Tensor: Indicating whether each box is inside \
+ the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > box_range[0])
+ & (self.tensor[:, 2] > box_range[1])
+ & (self.tensor[:, 0] < box_range[2])
+ & (self.tensor[:, 2] < box_range[3]))
+ return in_range_flags
+
+ @classmethod
+ def height_overlaps(cls, boxes1, boxes2, mode='iou'):
+ """Calculate height overlaps of two boxes.
+
+ This function calculates the height overlaps between ``boxes1`` and
+ ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type.
+
+ Args:
+ boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes.
+ boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes.
+ mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
+
+ Returns:
+ torch.Tensor: Calculated height overlap of the boxes.
+ """
+ assert isinstance(boxes1, CameraInstance3DBoxes)
+ assert isinstance(boxes2, CameraInstance3DBoxes)
+
+ boxes1_top_height = boxes1.top_height.view(-1, 1)
+ boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
+ boxes2_top_height = boxes2.top_height.view(1, -1)
+ boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
+
+ # In camera coordinate system
+ # from up to down is the positive direction
+ highest_of_bottom = torch.min(boxes1_bottom_height,
+ boxes2_bottom_height)
+ lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height)
+ overlaps_h = torch.clamp(highest_of_bottom - lowest_of_top, min=0)
+ return overlaps_h
+
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`Box3DMode`): The target Box mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from ``src`` coordinates to ``dst`` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: \
+ The converted box of the same type in the ``dst`` mode.
+ """
+ from .box_3d_mode import Box3DMode
+ return Box3DMode.convert(
+ box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat)
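+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the
+ # gravity_center computation above in camera coordinates. Because +y points
+ # down, the gravity centre sits at bottom_y - 0.5 * y_size rather than
+ # bottom_y + 0.5 * y_size. The box values are made up.
+ import torch
+ box = torch.tensor([[1.0, 2.0, 10.0, 4.0, 1.6, 2.0, 0.0]])
+ gravity_y = box[:, 1] - box[:, 4] * 0.5
+ print(gravity_y.item())   # 1.2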
diff --git a/mmcv/core/bbox/structures/coord_3d_mode.py b/mmcv/core/bbox/structures/coord_3d_mode.py
new file mode 100644
index 0000000..2d0de8d
--- /dev/null
+++ b/mmcv/core/bbox/structures/coord_3d_mode.py
@@ -0,0 +1,281 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from enum import IntEnum, unique
+
+from mmcv.core.points import (BasePoints, CameraPoints, DepthPoints,
+ LiDARPoints)
+from .base_box3d import BaseInstance3DBoxes
+from .cam_box3d import CameraInstance3DBoxes
+from .depth_box3d import DepthInstance3DBoxes
+from .lidar_box3d import LiDARInstance3DBoxes
+
+
+@unique
+class Coord3DMode(IntEnum):
+ r"""Enum of different ways to represent a box
+ and point cloud.
+
+ Coordinates in LiDAR:
+
+ .. code-block:: none
+
+ up z
+ ^ x front
+ | /
+ | /
+ left y <------ 0
+
+ The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
+ and the yaw is around the z axis, thus the rotation axis=2.
+
+ Coordinates in camera:
+
+ .. code-block:: none
+
+ z front
+ /
+ /
+ 0 ------> x right
+ |
+ |
+ v
+ down y
+
+ The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
+ and the yaw is around the y axis, thus the rotation axis=1.
+
+ Coordinates in Depth mode:
+
+ .. code-block:: none
+
+ up z
+ ^ y front
+ | /
+ | /
+ 0 ------> x right
+
+ The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
+ and the yaw is around the z axis, thus the rotation axis=2.
+ """
+
+ LIDAR = 0
+ CAM = 1
+ DEPTH = 2
+
+ @staticmethod
+ def convert(input, src, dst, rt_mat=None):
+ """Convert boxes or points from `src` mode to `dst` mode."""
+ if isinstance(input, BaseInstance3DBoxes):
+ return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat)
+ elif isinstance(input, BasePoints):
+ return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat)
+ else:
+ raise NotImplementedError
+
+ @staticmethod
+ def convert_box(box, src, dst, rt_mat=None):
+ """Convert boxes from `src` mode to `dst` mode.
+
+ Args:
+ box (tuple | list | np.ndarray |
+ torch.Tensor | BaseInstance3DBoxes):
+ Can be a k-tuple, k-list or an Nxk array/tensor, where k >= 7.
+ src (:obj:`CoordMode`): The src Box mode.
+ dst (:obj:`CoordMode`): The target Box mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \
+ The converted box of the same type.
+ """
+ if src == dst:
+ return box
+
+ is_numpy = isinstance(box, np.ndarray)
+ is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
+ single_box = isinstance(box, (list, tuple))
+ if single_box:
+ assert len(box) >= 7, (
+ 'CoordMode.convert takes either a k-tuple/list or '
+ 'an Nxk array/tensor, where k >= 7')
+ arr = torch.tensor(box)[None, :]
+ else:
+ # avoid modifying the input box
+ if is_numpy:
+ arr = torch.from_numpy(np.asarray(box)).clone()
+ elif is_Instance3DBoxes:
+ arr = box.tensor.clone()
+ else:
+ arr = box.clone()
+
+ # convert box from `src` mode to `dst` mode.
+ x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
+ if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+ xyz_size = torch.cat([y_size, z_size, x_size], dim=-1)
+ elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
+ xyz_size = torch.cat([z_size, x_size, y_size], dim=-1)
+ elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+ xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+ elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+ xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
+ elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
+ xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
+ elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
+ xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
+ else:
+ raise NotImplementedError(
+ f'Conversion from Coord3DMode {src} to {dst} '
+ 'is not supported yet')
+
+ if not isinstance(rt_mat, torch.Tensor):
+ rt_mat = arr.new_tensor(rt_mat)
+ if rt_mat.size(1) == 4:
+ extended_xyz = torch.cat(
+ [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
+ xyz = extended_xyz @ rt_mat.t()
+ else:
+ xyz = arr[:, :3] @ rt_mat.t()
+
+ remains = arr[..., 6:]
+ arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1)
+
+ # convert arr to the original type
+ original_type = type(box)
+ if single_box:
+ return original_type(arr.flatten().tolist())
+ if is_numpy:
+ return arr.numpy()
+ elif is_Instance3DBoxes:
+ if dst == Coord3DMode.CAM:
+ target_type = CameraInstance3DBoxes
+ elif dst == Coord3DMode.LIDAR:
+ target_type = LiDARInstance3DBoxes
+ elif dst == Coord3DMode.DEPTH:
+ target_type = DepthInstance3DBoxes
+ else:
+ raise NotImplementedError(
+ f'Conversion to {dst} through {original_type}'
+ ' is not supported yet')
+ return target_type(
+ arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
+ else:
+ return arr
+
+ @staticmethod
+ def convert_point(point, src, dst, rt_mat=None):
+ """Convert points from `src` mode to `dst` mode.
+
+ Args:
+ point (tuple | list | np.ndarray |
+ torch.Tensor | BasePoints):
+ Can be a k-tuple, k-list or an Nxk array/tensor.
+ src (:obj:`CoordMode`): The src Point mode.
+ dst (:obj:`CoordMode`): The target Point mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ (tuple | list | np.ndarray | torch.Tensor | BasePoints): \
+ The converted point of the same type.
+ """
+ if src == dst:
+ return point
+
+ is_numpy = isinstance(point, np.ndarray)
+ is_InstancePoints = isinstance(point, BasePoints)
+ single_point = isinstance(point, (list, tuple))
+ if single_point:
+ assert len(point) >= 3, (
+ 'CoordMode.convert takes either a k-tuple/list or '
+ 'an Nxk array/tensor, where k >= 3')
+ arr = torch.tensor(point)[None, :]
+ else:
+ # avoid modifying the input point
+ if is_numpy:
+ arr = torch.from_numpy(np.asarray(point)).clone()
+ elif is_InstancePoints:
+ arr = point.tensor.clone()
+ else:
+ arr = point.clone()
+
+ # convert point from `src` mode to `dst` mode.
+ # TODO: LIDAR
+ # only implemented provided Rt matrix in cam-depth conversion
+ if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+ elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
+ elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+ elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+ elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
+ elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
+ else:
+ raise NotImplementedError(
+ f'Conversion from Coord3DMode {src} to {dst} '
+ 'is not supported yet')
+
+ if not isinstance(rt_mat, torch.Tensor):
+ rt_mat = arr.new_tensor(rt_mat)
+ if rt_mat.size(1) == 4:
+ extended_xyz = torch.cat(
+ [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
+ xyz = extended_xyz @ rt_mat.t()
+ else:
+ xyz = arr[:, :3] @ rt_mat.t()
+
+ remains = arr[:, 3:]
+ arr = torch.cat([xyz[:, :3], remains], dim=-1)
+
+ # convert arr to the original type
+ original_type = type(point)
+ if single_point:
+ return original_type(arr.flatten().tolist())
+ if is_numpy:
+ return arr.numpy()
+ elif is_InstancePoints:
+ if dst == Coord3DMode.CAM:
+ target_type = CameraPoints
+ elif dst == Coord3DMode.LIDAR:
+ target_type = LiDARPoints
+ elif dst == Coord3DMode.DEPTH:
+ target_type = DepthPoints
+ else:
+ raise NotImplementedError(
+ f'Conversion to {dst} through {original_type}'
+ ' is not supported yet')
+ return target_type(
+ arr,
+ points_dim=arr.size(-1),
+ attribute_dims=point.attribute_dims)
+ else:
+ return arr
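+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the
+ # LiDAR -> CAM branch of convert_point() applied to one point. Only the
+ # xyz channels are rotated; any extra channels (e.g. intensity) pass
+ # through unchanged. The point values are made up.
+ import torch
+ pts = torch.tensor([[10.0, 2.0, -1.0, 0.7]])   # x, y, z, intensity
+ rt_mat = pts.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
+ cam = torch.cat([pts[:, :3] @ rt_mat.t(), pts[:, 3:]], dim=-1)
+ print(cam[0].tolist())   # [-2.0, 1.0, 10.0, 0.7]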
diff --git a/mmcv/core/bbox/structures/depth_box3d.py b/mmcv/core/bbox/structures/depth_box3d.py
new file mode 100644
index 0000000..058e975
--- /dev/null
+++ b/mmcv/core/bbox/structures/depth_box3d.py
@@ -0,0 +1,343 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmcv.core.points import BasePoints
+from mmcv.ops.roiaware_pool3d import points_in_boxes_batch
+from .base_box3d import BaseInstance3DBoxes
+from .utils import limit_period, rotation_3d_in_axis
+
+
+class DepthInstance3DBoxes(BaseInstance3DBoxes):
+ """3D boxes of instances in Depth coordinates.
+
+ Coordinates in Depth:
+
+ .. code-block:: none
+
+ up z y front (yaw=-0.5*pi)
+ ^ ^
+ | /
+ | /
+ 0 ------> x right (yaw=0)
+
+ The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
+ and the yaw is around the z axis, thus the rotation axis=2.
+ The yaw is 0 at the positive direction of x axis, and decreases from
+ the positive direction of x to the positive direction of y.
+ Also note that rotation of DepthInstance3DBoxes is counterclockwise,
+ which is reverse to the definition of the yaw angle (clockwise).
+
+ A refactor is ongoing to make the three coordinate systems
+ easier to understand and convert between each other.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x box_dim.
+ box_dim (int): Integer indicating the dimension of a box.
+ Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
+ with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
+ boxes.
+ """
+
+ @property
+ def gravity_center(self):
+ """torch.Tensor: A tensor with center of each box."""
+ bottom_center = self.bottom_center
+ gravity_center = torch.zeros_like(bottom_center)
+ gravity_center[:, :2] = bottom_center[:, :2]
+ gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
+ return gravity_center
+
+ @property
+ def corners(self):
+ """torch.Tensor: Coordinates of corners of all the boxes
+ in shape (N, 8, 3).
+
+ Convert the boxes to corners in clockwise order, in form of
+ ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``
+
+ .. code-block:: none
+
+ up z
+ front y ^
+ / |
+ / |
+ (x0, y1, z1) + ----------- + (x1, y1, z1)
+ /| / |
+ / | / |
+ (x0, y0, z1) + ----------- + + (x1, y1, z0)
+ | / . | /
+ | / origin | /
+ (x0, y0, z0) + ----------- + --------> right x
+ (x1, y0, z0)
+ """
+ # TODO: rotation_3d_in_axis function do not support
+ # empty tensor currently.
+ assert len(self.tensor) != 0
+ dims = self.dims
+ corners_norm = torch.from_numpy(
+ np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
+ device=dims.device, dtype=dims.dtype)
+
+ corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+ # use relative origin (0.5, 0.5, 0)
+ corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
+ corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
+
+ # rotate around z axis
+ corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2)
+ corners += self.tensor[:, :3].view(-1, 1, 3)
+ return corners
+
+ @property
+ def bev(self):
+ """torch.Tensor: A n x 5 tensor of 2D BEV box of each box
+ in XYWHR format."""
+ return self.tensor[:, [0, 1, 3, 4, 6]]
+
+ @property
+ def nearest_bev(self):
+ """torch.Tensor: A tensor of 2D BEV box of each box
+ without rotation."""
+ # Obtain BEV boxes with rotation in XYWHR format
+ bev_rotated_boxes = self.bev
+ # convert the rotation to a valid range
+ rotations = bev_rotated_boxes[:, -1]
+ normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
+
+ # find the center of boxes
+ conditions = (normed_rotations > np.pi / 4)[..., None]
+ bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
+ [0, 1, 3, 2]],
+ bev_rotated_boxes[:, :4])
+
+ centers = bboxes_xywh[:, :2]
+ dims = bboxes_xywh[:, 2:]
+ bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
+ return bev_boxes
+
+ def rotate(self, angle, points=None):
+ """Rotate boxes with points (optional) with the given angle or \
+ rotation matrix.
+
+ Args:
+ angle (float | torch.Tensor | np.ndarray):
+ Rotation angle or rotation matrix.
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
+ Points to rotate. Defaults to None.
+
+ Returns:
+ tuple or None: When ``points`` is None, the function returns \
+ None, otherwise it returns the rotated points and the \
+ rotation matrix ``rot_mat_T``.
+ """
+ if not isinstance(angle, torch.Tensor):
+ angle = self.tensor.new_tensor(angle)
+ assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \
+ f'invalid rotation angle shape {angle.shape}'
+
+ if angle.numel() == 1:
+ rot_sin = torch.sin(angle)
+ rot_cos = torch.cos(angle)
+ rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0],
+ [rot_sin, rot_cos, 0],
+ [0, 0, 1]]).T
+ else:
+ rot_mat_T = angle.T
+ rot_sin = rot_mat_T[0, 1]
+ rot_cos = rot_mat_T[0, 0]
+ angle = np.arctan2(rot_sin, rot_cos)
+
+ self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T
+ if self.with_yaw:
+ self.tensor[:, 6] -= angle
+ else:
+ corners_rot = self.corners @ rot_mat_T
+ new_x_size = corners_rot[..., 0].max(
+ dim=1, keepdim=True)[0] - corners_rot[..., 0].min(
+ dim=1, keepdim=True)[0]
+ new_y_size = corners_rot[..., 1].max(
+ dim=1, keepdim=True)[0] - corners_rot[..., 1].min(
+ dim=1, keepdim=True)[0]
+ self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1)
+
+ if points is not None:
+ if isinstance(points, torch.Tensor):
+ points[:, :3] = points[:, :3] @ rot_mat_T
+ elif isinstance(points, np.ndarray):
+ rot_mat_T = rot_mat_T.numpy()
+ points[:, :3] = np.dot(points[:, :3], rot_mat_T)
+ elif isinstance(points, BasePoints):
+ # anti-clockwise
+ points.rotate(angle)
+ else:
+ raise ValueError
+ return points, rot_mat_T
+
+ def flip(self, bev_direction='horizontal', points=None):
+ """Flip the boxes in BEV along given BEV direction.
+
+ In Depth coordinates, it flips x (horizontal) or y (vertical) axis.
+
+ Args:
+ bev_direction (str): Flip direction (horizontal or vertical).
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
+ Points to flip. Defaults to None.
+
+ Returns:
+ torch.Tensor, numpy.ndarray or None: Flipped points.
+ """
+ assert bev_direction in ('horizontal', 'vertical')
+ if bev_direction == 'horizontal':
+ self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+ if self.with_yaw:
+ self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+ elif bev_direction == 'vertical':
+ self.tensor[:, 1::7] = -self.tensor[:, 1::7]
+ if self.with_yaw:
+ self.tensor[:, 6] = -self.tensor[:, 6]
+
+ if points is not None:
+ assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
+ if isinstance(points, (torch.Tensor, np.ndarray)):
+ if bev_direction == 'horizontal':
+ points[:, 0] = -points[:, 0]
+ elif bev_direction == 'vertical':
+ points[:, 1] = -points[:, 1]
+ elif isinstance(points, BasePoints):
+ points.flip(bev_direction)
+ return points
+
+ def in_range_bev(self, box_range):
+ """Check whether the boxes are in the given range.
+
+ Args:
+ box_range (list | torch.Tensor): The range of box
+ (x_min, y_min, x_max, y_max).
+
+ Note:
+ In the original implementation of SECOND, checking whether
+ a box is in the range is done by checking whether its points lie
+ in a convex polygon; here we reduce the burden for simpler cases.
+
+ Returns:
+ torch.Tensor: Indicating whether each box is inside \
+ the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > box_range[0])
+ & (self.tensor[:, 1] > box_range[1])
+ & (self.tensor[:, 0] < box_range[2])
+ & (self.tensor[:, 1] < box_range[3]))
+ return in_range_flags
+
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`Box3DMode`): The target Box mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from ``src`` coordinates to ``dst`` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`DepthInstance3DBoxes`: \
+ The converted box of the same type in the ``dst`` mode.
+ """
+ from .box_3d_mode import Box3DMode
+ return Box3DMode.convert(
+ box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
+
+ def points_in_boxes(self, points):
+ """Find points that are in boxes (CUDA).
+
+ Args:
+ points (torch.Tensor): Points in shape [1, M, 3] or [M, 3], \
+ 3 dimensions are [x, y, z] in the Depth coordinate system.
+
+ Returns:
+ torch.Tensor: The index of boxes each point lies in with shape \
+ of (B, M, T).
+ """
+ from .box_3d_mode import Box3DMode
+
+ # to lidar
+ points_lidar = points.clone()
+ points_lidar = points_lidar[..., [1, 0, 2]]
+ points_lidar[..., 1] *= -1
+ if points.dim() == 2:
+ points_lidar = points_lidar.unsqueeze(0)
+ else:
+ assert points.dim() == 3 and points_lidar.shape[0] == 1
+
+ boxes_lidar = self.convert_to(Box3DMode.LIDAR).tensor
+ boxes_lidar = boxes_lidar.to(points.device).unsqueeze(0)
+ box_idxs_of_pts = points_in_boxes_batch(points_lidar, boxes_lidar)
+
+ return box_idxs_of_pts.squeeze(0)
+
+ def enlarged_box(self, extra_width):
+ """Enlarge the length, width and height boxes.
+
+ Args:
+ extra_width (float | torch.Tensor): Extra width to enlarge the box.
+
+ Returns:
+ :obj:`LiDARInstance3DBoxes`: Enlarged boxes.
+ """
+ enlarged_boxes = self.tensor.clone()
+ enlarged_boxes[:, 3:6] += extra_width * 2
+ # bottom center z minus extra_width
+ enlarged_boxes[:, 2] -= extra_width
+ return self.new_box(enlarged_boxes)
+
+ def get_surface_line_center(self):
+ """Compute surface and line center of bounding boxes.
+
+ Returns:
+ torch.Tensor: Surface and line center of bounding boxes.
+ """
+ obj_size = self.dims
+ center = self.gravity_center.view(-1, 1, 3)
+ batch_size = center.shape[0]
+
+ rot_sin = torch.sin(-self.yaw)
+ rot_cos = torch.cos(-self.yaw)
+ rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3]))
+ rot_mat_T[..., 0, 0] = rot_cos
+ rot_mat_T[..., 0, 1] = -rot_sin
+ rot_mat_T[..., 1, 0] = rot_sin
+ rot_mat_T[..., 1, 1] = rot_cos
+ rot_mat_T[..., 2, 2] = 1
+
+ # Get the object surface center
+ offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0],
+ [0, -1, 0], [1, 0, 0], [-1, 0, 0]])
+ offset = offset.view(1, 6, 3) / 2
+ surface_3d = (offset *
+ obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape(
+ -1, 3)
+
+ # Get the object line center
+ offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1],
+ [0, -1, 1], [1, 0, -1], [-1, 0, -1],
+ [0, 1, -1], [0, -1, -1], [1, 1, 0],
+ [1, -1, 0], [-1, 1, 0], [-1, -1, 0]])
+ offset = offset.view(1, 12, 3) / 2
+
+ line_3d = (offset *
+ obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape(
+ -1, 3)
+
+ surface_rot = rot_mat_T.repeat(6, 1, 1)
+ surface_3d = torch.matmul(
+ surface_3d.unsqueeze(-2), surface_rot.transpose(2, 1)).squeeze(-2)
+ surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d
+
+ line_rot = rot_mat_T.repeat(12, 1, 1)
+ line_3d = torch.matmul(
+ line_3d.unsqueeze(-2), line_rot.transpose(2, 1)).squeeze(-2)
+ line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d
+
+ return surface_center, line_center
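+
+
+if __name__ == '__main__':
+ # Editor's illustrative sketch, not part of the original module: the
+ # arithmetic in enlarged_box(). Each size grows by 2 * extra_width and the
+ # bottom-centre z drops by extra_width, so the enlargement is symmetric in
+ # z as well. The box values are made up.
+ import torch
+ box = torch.tensor([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0]])
+ extra_width = 0.5
+ enlarged = box.clone()
+ enlarged[:, 3:6] += extra_width * 2
+ enlarged[:, 2] -= extra_width
+ print(enlarged[0].tolist())   # [0.0, 0.0, -0.5, 3.0, 3.0, 3.0, 0.0]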
diff --git a/mmcv/core/bbox/structures/lidar_box3d.py b/mmcv/core/bbox/structures/lidar_box3d.py
new file mode 100644
index 0000000..f7e7ec2
--- /dev/null
+++ b/mmcv/core/bbox/structures/lidar_box3d.py
@@ -0,0 +1,270 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmcv.core.points import BasePoints
+from mmcv.ops.roiaware_pool3d import points_in_boxes_gpu
+from .base_box3d import BaseInstance3DBoxes
+from .utils import limit_period, rotation_3d_in_axis
+
+
+class LiDARInstance3DBoxes(BaseInstance3DBoxes):
+ """3D boxes of instances in LIDAR coordinates.
+
+ Coordinates in LiDAR:
+
+ .. code-block:: none
+
+ up z x front (yaw=-0.5*pi)
+ ^ ^
+ | /
+ | /
+ (yaw=-pi) left y <------ 0 -------- (yaw=0)
+
+ The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
+ and the yaw is around the z axis, thus the rotation axis=2.
+ The yaw is 0 at the negative direction of y axis, and decreases from
+ the negative direction of y to the positive direction of x.
+
+ A refactor is ongoing to make the three coordinate systems
+ easier to understand and convert between each other.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x box_dim.
+ box_dim (int): Integer indicating the dimension of a box.
+ Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
+        with_yaw (bool): If False, the value of yaw will be set to 0 and the
+            boxes are treated as axis-aligned (minmax) boxes.
+ """
+
+ @property
+ def gravity_center(self):
+ """torch.Tensor: A tensor with center of each box."""
+ bottom_center = self.bottom_center
+ gravity_center = torch.zeros_like(bottom_center)
+ gravity_center[:, :2] = bottom_center[:, :2]
+ gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
+ return gravity_center
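+    # Illustrative example (box values assumed): a LiDAR box whose bottom center
+    # sits at z = -1 with height 2 has its gravity center at z = 0:
+    #   >>> boxes = LiDARInstance3DBoxes(torch.tensor([[0., 0., -1., 2., 2., 2., 0.]]))
+    #   >>> boxes.gravity_center
+    #   tensor([[0., 0., 0.]])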
+
+ @property
+ def corners(self):
+ """torch.Tensor: Coordinates of corners of all the boxes
+ in shape (N, 8, 3).
+
+ Convert the boxes to corners in clockwise order, in form of
+ ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``
+
+ .. code-block:: none
+
+ up z
+ front x ^
+ / |
+ / |
+ (x1, y0, z1) + ----------- + (x1, y1, z1)
+ /| / |
+ / | / |
+ (x0, y0, z1) + ----------- + + (x1, y1, z0)
+ | / . | /
+ | / origin | /
+ left y<-------- + ----------- + (x0, y1, z0)
+ (x0, y0, z0)
+ """
+ # TODO: rotation_3d_in_axis function do not support
+ # empty tensor currently.
+ assert len(self.tensor) != 0
+ dims = self.dims
+ corners_norm = torch.from_numpy(
+ np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
+ device=dims.device, dtype=dims.dtype)
+
+ corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+ # use relative origin [0.5, 0.5, 0]
+ corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
+ corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
+
+ # rotate around z axis
+ corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2)
+ corners += self.tensor[:, :3].view(-1, 1, 3)
+ return corners
+
+ @property
+ def bev(self):
+ """torch.Tensor: 2D BEV box of each box with rotation
+ in XYWHR format."""
+ return self.tensor[:, [0, 1, 3, 4, 6]]
+
+ @property
+ def nearest_bev(self):
+ """torch.Tensor: A tensor of 2D BEV box of each box
+ without rotation."""
+ # Obtain BEV boxes with rotation in XYWHR format
+ bev_rotated_boxes = self.bev
+ # convert the rotation to a valid range
+ rotations = bev_rotated_boxes[:, -1]
+ normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
+
+        # swap the width and length of boxes whose normalized rotation exceeds 45 degrees
+ conditions = (normed_rotations > np.pi / 4)[..., None]
+ bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
+ [0, 1, 3, 2]],
+ bev_rotated_boxes[:, :4])
+
+ centers = bboxes_xywh[:, :2]
+ dims = bboxes_xywh[:, 2:]
+ bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
+ return bev_boxes
+
+ def rotate(self, angle, points=None):
+ """Rotate boxes with points (optional) with the given angle or \
+ rotation matrix.
+
+ Args:
+ angles (float | torch.Tensor | np.ndarray):
+ Rotation angle or rotation matrix.
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
+ Points to rotate. Defaults to None.
+
+ Returns:
+ tuple or None: When ``points`` is None, the function returns \
+ None, otherwise it returns the rotated points and the \
+ rotation matrix ``rot_mat_T``.
+ """
+ if not isinstance(angle, torch.Tensor):
+ angle = self.tensor.new_tensor(angle)
+ assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \
+ f'invalid rotation angle shape {angle.shape}'
+
+ if angle.numel() == 1:
+ rot_sin = torch.sin(angle)
+ rot_cos = torch.cos(angle)
+ rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0],
+ [rot_sin, rot_cos, 0],
+ [0, 0, 1]])
+ else:
+ rot_mat_T = angle
+ rot_sin = rot_mat_T[1, 0]
+ rot_cos = rot_mat_T[0, 0]
+ angle = np.arctan2(rot_sin, rot_cos)
+
+ self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
+ self.tensor[:, 6] += angle
+
+ if self.tensor.shape[1] == 9:
+ # rotate velo vector
+ self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2]
+
+ if points is not None:
+ if isinstance(points, torch.Tensor):
+ points[:, :3] = points[:, :3] @ rot_mat_T
+ elif isinstance(points, np.ndarray):
+ rot_mat_T = rot_mat_T.numpy()
+ points[:, :3] = np.dot(points[:, :3], rot_mat_T)
+ elif isinstance(points, BasePoints):
+ # clockwise
+ points.rotate(-angle)
+ else:
+ raise ValueError
+ return points, rot_mat_T
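+    # Usage sketch (tensor names assumed): rotating the boxes and the accompanying
+    # point cloud by the same angle keeps them aligned; the rotation matrix that was
+    # applied (``rot_mat_T``) is returned as well:
+    #   >>> points, rot_mat_T = boxes.rotate(np.pi / 2, points)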
+
+ def flip(self, bev_direction='horizontal', points=None):
+ """Flip the boxes in BEV along given BEV direction.
+
+ In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis.
+
+ Args:
+ bev_direction (str): Flip direction (horizontal or vertical).
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
+ Points to flip. Defaults to None.
+
+ Returns:
+ torch.Tensor, numpy.ndarray or None: Flipped points.
+ """
+ assert bev_direction in ('horizontal', 'vertical')
+ if bev_direction == 'horizontal':
+ self.tensor[:, 1::7] = -self.tensor[:, 1::7]
+ if self.with_yaw:
+ self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+ elif bev_direction == 'vertical':
+ self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+ if self.with_yaw:
+ self.tensor[:, 6] = -self.tensor[:, 6]
+
+ if points is not None:
+ assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
+ if isinstance(points, (torch.Tensor, np.ndarray)):
+ if bev_direction == 'horizontal':
+ points[:, 1] = -points[:, 1]
+ elif bev_direction == 'vertical':
+ points[:, 0] = -points[:, 0]
+ elif isinstance(points, BasePoints):
+ points.flip(bev_direction)
+ return points
+
+ def in_range_bev(self, box_range):
+ """Check whether the boxes are in the given range.
+
+ Args:
+ box_range (list | torch.Tensor): the range of box
+ (x_min, y_min, x_max, y_max)
+
+ Note:
+ The original implementation of SECOND checks whether boxes in
+ a range by checking whether the points are in a convex
+ polygon, we reduce the burden for simpler cases.
+
+ Returns:
+ torch.Tensor: Whether each box is inside the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > box_range[0])
+ & (self.tensor[:, 1] > box_range[1])
+ & (self.tensor[:, 0] < box_range[2])
+ & (self.tensor[:, 1] < box_range[3]))
+ return in_range_flags
+
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`Box3DMode`): the target Box mode
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from ``src`` coordinates to ``dst`` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: \
+ The converted box of the same type in the ``dst`` mode.
+ """
+ from .box_3d_mode import Box3DMode
+ return Box3DMode.convert(
+ box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat)
+
+ def enlarged_box(self, extra_width):
+ """Enlarge the length, width and height boxes.
+
+ Args:
+ extra_width (float | torch.Tensor): Extra width to enlarge the box.
+
+ Returns:
+ :obj:`LiDARInstance3DBoxes`: Enlarged boxes.
+ """
+ enlarged_boxes = self.tensor.clone()
+ enlarged_boxes[:, 3:6] += extra_width * 2
+ # bottom center z minus extra_width
+ enlarged_boxes[:, 2] -= extra_width
+ return self.new_box(enlarged_boxes)
+
+ def points_in_boxes(self, points):
+ """Find the box which the points are in.
+
+ Args:
+ points (torch.Tensor): Points in shape (N, 3).
+
+ Returns:
+            torch.Tensor: The index of the box each point lies in.
+ """
+ box_idx = points_in_boxes_gpu(
+ points.unsqueeze(0),
+ self.tensor.unsqueeze(0).to(points.device)).squeeze(0)
+ return box_idx
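+    # Usage sketch (point tensor assumed): assigns each LiDAR point a box index,
+    # with -1 for points that fall outside every box:
+    #   >>> box_idx = boxes.points_in_boxes(points_xyz[:, :3])  # shape (N_points,)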
diff --git a/mmcv/core/bbox/structures/nuscenes_box.py b/mmcv/core/bbox/structures/nuscenes_box.py
new file mode 100644
index 0000000..05200a0
--- /dev/null
+++ b/mmcv/core/bbox/structures/nuscenes_box.py
@@ -0,0 +1,458 @@
+# nuScenes dev-kit.
+# Code written by Oscar Beijbom, 2018.
+
+import copy
+from typing import Tuple, List
+
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.axes import Axes
+from matplotlib.collections import LineCollection
+from pyquaternion import Quaternion
+from nuscenes.utils.geometry_utils import view_points
+from nuscenes.eval.common.data_classes import EvalBox
+from nuscenes.eval.detection.constants import DETECTION_NAMES, ATTRIBUTE_NAMES
+
+
+def color_map(data, cmap):
+ """数值映射为颜色"""
+
+ dmin, dmax = np.nanmin(data), np.nanmax(data)
+ cmo = plt.cm.get_cmap(cmap)
+ cs, k = list(), 256/cmo.N
+
+ for i in range(cmo.N):
+ c = cmo(i)
+ for j in range(int(i*k), int((i+1)*k)):
+ cs.append(c)
+ cs = np.array(cs)
+ data = np.uint8(255*(data-dmin)/(dmax-dmin))
+
+ return cs[data]
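+# A brief sketch of what ``color_map`` does (values assumed): the data are
+# normalized to integers in [0, 255] and used to index a 256-entry palette built
+# from the requested Matplotlib colormap, yielding one RGBA color per value:
+#   >>> colors = color_map(np.linspace(0., 1., 300), 'viridis')  # shape (300, 4)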
+
+class CustomNuscenesBox:
+ """ Simple data class representing a 3d box including, label, score and velocity. """
+
+ def __init__(self,
+ center: List[float],
+ size: List[float],
+ orientation: Quaternion,
+ fut_trajs: List[float],
+ label: int = np.nan,
+ score: float = np.nan,
+ velocity: Tuple = (np.nan, np.nan, np.nan),
+ name: str = None,
+ token: str = None):
+ """
+ :param center: Center of box given as x, y, z.
+ :param size: Size of box in width, length, height.
+ :param orientation: Box orientation.
+ :param label: Integer label, optional.
+ :param score: Classification score, optional.
+ :param velocity: Box velocity in x, y, z direction.
+        :param name: Box name, optional. Can be used e.g. to denote the category name.
+        :param token: Unique string identifier from DB.
+        :param fut_trajs: Future trajectory offsets of the box, flattened to fut_ts * 2 values per mode.
+ """
+ assert not np.any(np.isnan(center))
+ assert not np.any(np.isnan(size))
+ assert len(center) == 3
+ assert len(size) == 3
+ assert type(orientation) == Quaternion
+
+ self.center = np.array(center)
+ self.wlh = np.array(size)
+ self.orientation = orientation
+ self.label = int(label) if not np.isnan(label) else label
+ self.score = float(score) if not np.isnan(score) else score
+ self.velocity = np.array(velocity)
+ self.name = name
+ self.token = token
+ self.fut_trajs = np.array(fut_trajs)
+
+ def __eq__(self, other):
+ center = np.allclose(self.center, other.center)
+ wlh = np.allclose(self.wlh, other.wlh)
+ orientation = np.allclose(self.orientation.elements, other.orientation.elements)
+ label = (self.label == other.label) or (np.isnan(self.label) and np.isnan(other.label))
+ score = (self.score == other.score) or (np.isnan(self.score) and np.isnan(other.score))
+ vel = (np.allclose(self.velocity, other.velocity) or
+ (np.all(np.isnan(self.velocity)) and np.all(np.isnan(other.velocity))))
+
+ return center and wlh and orientation and label and score and vel
+
+ def __repr__(self):
+ repr_str = 'label: {}, score: {:.2f}, xyz: [{:.2f}, {:.2f}, {:.2f}], wlh: [{:.2f}, {:.2f}, {:.2f}], ' \
+ 'rot axis: [{:.2f}, {:.2f}, {:.2f}], ang(degrees): {:.2f}, ang(rad): {:.2f}, ' \
+ 'vel: {:.2f}, {:.2f}, {:.2f}, name: {}, token: {}'
+
+ return repr_str.format(self.label, self.score, self.center[0], self.center[1], self.center[2], self.wlh[0],
+ self.wlh[1], self.wlh[2], self.orientation.axis[0], self.orientation.axis[1],
+ self.orientation.axis[2], self.orientation.degrees, self.orientation.radians,
+ self.velocity[0], self.velocity[1], self.velocity[2], self.name, self.token)
+
+ @property
+ def rotation_matrix(self) -> np.ndarray:
+ """
+ Return a rotation matrix.
+        :return: The box's rotation matrix of shape (3, 3).
+ """
+ return self.orientation.rotation_matrix
+
+ def translate(self, x: np.ndarray) -> None:
+ """
+ Applies a translation.
+        :param x: Translation vector of shape (3, ) in x, y, z direction.
+ """
+ self.center += x
+
+ def rotate(self, quaternion: Quaternion) -> None:
+ """
+ Rotates box.
+ :param quaternion: Rotation to apply.
+ """
+ self.center = np.dot(quaternion.rotation_matrix, self.center)
+ self.orientation = quaternion * self.orientation
+ self.velocity = np.dot(quaternion.rotation_matrix, self.velocity)
+
+ def corners(self, wlh_factor: float = 1.0) -> np.ndarray:
+ """
+ Returns the bounding box corners.
+ :param wlh_factor: Multiply w, l, h by a factor to scale the box.
+        :return: Corner coordinates of shape (3, 8). First four corners are the ones facing forward.
+            The last four are the ones facing backwards.
+ """
+ w, l, h = self.wlh * wlh_factor
+
+ # 3D bounding box corners. (Convention: x points forward, y to the left, z up.)
+ x_corners = l / 2 * np.array([1, 1, 1, 1, -1, -1, -1, -1])
+ y_corners = w / 2 * np.array([1, -1, -1, 1, 1, -1, -1, 1])
+ z_corners = h / 2 * np.array([1, 1, -1, -1, 1, 1, -1, -1])
+ corners = np.vstack((x_corners, y_corners, z_corners))
+
+ # Rotate
+ corners = np.dot(self.orientation.rotation_matrix, corners)
+
+ # Translate
+ x, y, z = self.center
+ corners[0, :] = corners[0, :] + x
+ corners[1, :] = corners[1, :] + y
+ corners[2, :] = corners[2, :] + z
+
+ return corners
+
+ def bottom_corners(self) -> np.ndarray:
+ """
+ Returns the four bottom corners.
+        :return: Bottom corners of shape (3, 4). First two face forward, last two face backwards.
+ """
+ return self.corners()[:, [2, 3, 7, 6]]
+
+ def render(self,
+ axis: Axes,
+ view: np.ndarray = np.eye(3),
+ normalize: bool = False,
+ colors: Tuple = ('b', 'r', 'k'),
+ linewidth: float = 2,
+ box_idx=None,
+ alpha=0.5) -> None:
+ """
+ Renders the box in the provided Matplotlib axis.
+ :param axis: Axis onto which the box should be drawn.
+        :param view: Projection matrix of shape (3, 3). Defines a projection if needed (e.g. for drawing the box in an image).
+        :param normalize: Whether to normalize the remaining coordinate.
+        :param colors: Tuple of three valid Matplotlib colors (str or normalized RGB tuple) for front,
+            back and sides.
+        :param linewidth: Width in pixel of the box sides.
+        :param box_idx: Optional index drawn next to the box for identification.
+        :param alpha: Transparency of the drawn lines.
+ """
+ corners = view_points(self.corners(), view, normalize=normalize)[:2, :]
+
+ def draw_rect(selected_corners, color, alpha):
+ prev = selected_corners[-1]
+ for corner in selected_corners:
+ axis.plot([prev[0], corner[0]], [prev[1], corner[1]], color=color, linewidth=linewidth, alpha=alpha)
+ prev = corner
+
+ # Draw the sides
+ for i in range(4):
+ axis.plot([corners.T[i][0], corners.T[i + 4][0]],
+ [corners.T[i][1], corners.T[i + 4][1]],
+ color=colors[2], linewidth=linewidth, alpha=alpha)
+
+ # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d)
+ draw_rect(corners.T[:4], colors[0], alpha)
+ draw_rect(corners.T[4:], colors[1], alpha)
+
+ # Draw line indicating the front
+ center_bottom_forward = np.mean(corners.T[2:4], axis=0)
+ center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0)
+ axis.plot([center_bottom[0], center_bottom_forward[0]],
+ [center_bottom[1], center_bottom_forward[1]],
+ color=colors[0], linewidth=linewidth, alpha=alpha)
+ if box_idx is not None and center_bottom[0] > -35 and center_bottom[1] > -35 \
+ and center_bottom[0] < 35 and center_bottom[1] < 35:
+ text = f'{box_idx}'
+ axis.text(center_bottom[0], center_bottom[1], text, ha='left', fontsize=5)
+
+ def render_fut_trajs(self,
+ axis: Axes,
+ color: str = 'b',
+ linewidth: float = 1,
+ fut_ts: int = 6,
+ mode_idx=None) -> None:
+ """
+        Renders the box's future trajectories in the provided Matplotlib axis.
+        :param axis: Axis onto which the trajectories should be drawn.
+        :param color: Valid Matplotlib color for the trajectories.
+        :param linewidth: Width in pixel of the trajectory lines.
+        :param fut_ts: Number of future timesteps per trajectory mode.
+        :param mode_idx: If given, only the trajectory of this mode is rendered.
+ """
+
+ fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2))
+ if mode_idx is not None:
+ fut_coords = fut_coords[[mode_idx]]
+ alpha = 0.8
+ for i in range(fut_coords.shape[0]):
+ fut_coord = fut_coords[i]
+ fut_coord = fut_coord.cumsum(axis=-2)
+ fut_coord = fut_coord + self.center[:2]
+ if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10:
+ if color == 'g':
+ axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha)
+ elif color == 'b':
+ axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha)
+ if mode_idx is None and fut_coord[-1, 0] > -35 and fut_coord[-1, 1] > -35 \
+ and fut_coord[-1, 0] < 35 and fut_coord[-1, 1] < 35:
+ text = f'{i}'
+ axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5)
+ axis.plot(
+ [self.center[0], fut_coord[0, 0]],
+ [self.center[1], fut_coord[0, 1]],
+ color=color, linewidth=linewidth, alpha=alpha
+ )
+ for i in range(fut_coord.shape[0]-1):
+ axis.plot(
+ [fut_coord[i, 0], fut_coord[i+1, 0]],
+ [fut_coord[i, 1], fut_coord[i+1, 1]],
+ color=color, linewidth=linewidth, alpha=alpha
+ )
+
+ def render_fut_trajs_grad_color(self,
+ axis: Axes,
+ linewidth: float = 1,
+ linestyles='solid',
+ cmap='viridis',
+ fut_ts: int = 6,
+ alpha: int = 0.8,
+ mode_idx=None) -> None:
+ """
+        Renders the box's future trajectories with a color gradient in the provided Matplotlib axis.
+        :param axis: Axis onto which the trajectories should be drawn.
+        :param linewidth: Width in pixel of the trajectory lines.
+        :param linestyles: Line style passed to the underlying LineCollection.
+        :param cmap: Matplotlib colormap used for the color gradient.
+        :param fut_ts: Number of future timesteps per trajectory mode.
+        :param alpha: Transparency of the drawn lines.
+        :param mode_idx: If given, only the trajectory of this mode is rendered.
+ """
+
+ fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2))
+ if mode_idx is not None:
+ fut_coords = fut_coords[[mode_idx]]
+
+ for i in range(fut_coords.shape[0]):
+ fut_coord = fut_coords[i]
+ fut_coord = fut_coord.cumsum(axis=-2)
+ fut_coord = fut_coord + self.center[:2]
+ fut_coord = np.concatenate((self.center[np.newaxis, :2], fut_coord), axis=0)
+ fut_coord_segments = np.stack((fut_coord[:-1], fut_coord[1:]), axis=1)
+
+ fut_vecs = None
+ for j in range(fut_coord_segments.shape[0]):
+ fut_vec_j = fut_coord_segments[j]
+ x_linspace = np.linspace(fut_vec_j[0, 0], fut_vec_j[1, 0], 51)
+ y_linspace = np.linspace(fut_vec_j[0, 1], fut_vec_j[1, 1], 51)
+ xy = np.stack((x_linspace, y_linspace), axis=1)
+ xy = np.stack((xy[:-1], xy[1:]), axis=1)
+ if fut_vecs is None:
+ fut_vecs = xy
+ else:
+ fut_vecs = np.concatenate((fut_vecs, xy), axis=0)
+
+ y = np.sin(np.linspace(3/2*np.pi, 5/2*np.pi, 301))
+ colors = color_map(y[:-1], cmap)
+ line_segments = LineCollection(fut_vecs, colors=colors, linewidths=linewidth, linestyles=linestyles, cmap=cmap)
+
+ # if mode_idx is None and abs(fut_coord[-1, 0]) < 35 and abs(fut_coord[-1, 1]) < 35:
+ # text = f'{i}'
+ # axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5)
+
+ axis.add_collection(line_segments)
+
+ def render_fut_trajs_coords(self,
+ axis: Axes,
+ color: str = 'b',
+ linewidth: float = 1,
+ fut_ts: int = 12) -> None:
+ """
+        Renders the box's future trajectory coordinates in the provided Matplotlib axis.
+        :param axis: Axis onto which the trajectories should be drawn.
+        :param color: Valid Matplotlib color for the trajectories.
+        :param linewidth: Width in pixel of the trajectory lines.
+        :param fut_ts: Number of future timesteps per trajectory mode.
+ """
+
+ fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2))
+ alpha = 0.2 if color == 'b' else 1
+ for i in range(fut_coords.shape[0]):
+ fut_coord = fut_coords[i]
+ fut_coord = fut_coord + self.center[:2]
+ if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10:
+ if color == 'g':
+ axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha)
+ elif color == 'b':
+ axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha)
+ axis.plot(
+ [self.center[0], fut_coord[0, 0]],
+ [self.center[1], fut_coord[0, 1]],
+ color=color, linewidth=linewidth, alpha=alpha
+ )
+ for i in range(fut_coord.shape[0]-1):
+ axis.plot(
+ [fut_coord[i, 0], fut_coord[i+1, 0]],
+ [fut_coord[i, 1], fut_coord[i+1, 1]],
+ color=color, linewidth=linewidth, alpha=alpha
+ )
+
+ def render_cv2(self,
+ im: np.ndarray,
+ view: np.ndarray = np.eye(3),
+ normalize: bool = False,
+ colors: Tuple = ((0, 0, 255), (255, 0, 0), (155, 155, 155)),
+ linewidth: int = 2) -> None:
+ """
+ Renders box using OpenCV2.
+        :param im: Image array. Channels are in BGR order.
+        :param view: Projection matrix of shape (3, 3). Defines a projection if needed (e.g. for drawing the box in an image).
+ :param normalize: Whether to normalize the remaining coordinate.
+ :param colors: ((R, G, B), (R, G, B), (R, G, B)). Colors for front, side & rear.
+ :param linewidth: Linewidth for plot.
+ """
+ corners = view_points(self.corners(), view, normalize=normalize)[:2, :]
+
+ def draw_rect(selected_corners, color):
+ prev = selected_corners[-1]
+ for corner in selected_corners:
+ cv2.line(im,
+ (int(prev[0]), int(prev[1])),
+ (int(corner[0]), int(corner[1])),
+ color, linewidth)
+ prev = corner
+
+ # Draw the sides
+ for i in range(4):
+ cv2.line(im,
+ (int(corners.T[i][0]), int(corners.T[i][1])),
+ (int(corners.T[i + 4][0]), int(corners.T[i + 4][1])),
+ colors[2][::-1], linewidth)
+
+ # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d)
+ draw_rect(corners.T[:4], colors[0][::-1])
+ draw_rect(corners.T[4:], colors[1][::-1])
+
+ # Draw line indicating the front
+ center_bottom_forward = np.mean(corners.T[2:4], axis=0)
+ center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0)
+ cv2.line(im,
+ (int(center_bottom[0]), int(center_bottom[1])),
+ (int(center_bottom_forward[0]), int(center_bottom_forward[1])),
+ colors[0][::-1], linewidth)
+
+ def copy(self) -> 'CustomNuscenesBox':
+ """
+ Create a copy of self.
+ :return: A copy.
+ """
+ return copy.deepcopy(self)
+
+
+class CustomDetectionBox(EvalBox):
+ """ Data class used during detection evaluation. Can be a prediction or ground truth."""
+
+ def __init__(self,
+ sample_token: str = "",
+ translation: Tuple[float, float, float] = (0, 0, 0),
+ size: Tuple[float, float, float] = (0, 0, 0),
+ rotation: Tuple[float, float, float, float] = (0, 0, 0, 0),
+ velocity: Tuple[float, float] = (0, 0),
+ ego_translation: Tuple[float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters.
+ num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes.
+ detection_name: str = 'car', # The class name used in the detection challenge.
+ detection_score: float = -1.0, # GT samples do not have a score.
+ attribute_name: str = '', # Box attribute. Each box can have at most 1 attribute.
+ fut_trajs=None): # future trajectories of a pred box, shape=[fut_ts*2].
+
+ super().__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts)
+
+ assert detection_name is not None, 'Error: detection_name cannot be empty!'
+ assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name
+
+ assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \
+ 'Error: Unknown attribute_name %s' % attribute_name
+
+ assert type(detection_score) == float, 'Error: detection_score must be a float!'
+ assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!'
+
+ # Assign.
+ self.detection_name = detection_name
+ self.detection_score = detection_score
+ self.attribute_name = attribute_name
+ self.fut_trajs = fut_trajs
+
+ def __eq__(self, other):
+ return (self.sample_token == other.sample_token and
+ self.translation == other.translation and
+ self.size == other.size and
+ self.rotation == other.rotation and
+ self.velocity == other.velocity and
+ self.ego_translation == other.ego_translation and
+ self.num_pts == other.num_pts and
+ self.detection_name == other.detection_name and
+ self.detection_score == other.detection_score and
+ self.attribute_name == other.attribute_name and
+ self.fut_trajs == other.fut_trajs)
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'fut_trajs': self.fut_trajs
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ fut_trajs=tuple(content['fut_trajs']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'])
diff --git a/mmcv/core/bbox/structures/utils.py b/mmcv/core/bbox/structures/utils.py
new file mode 100644
index 0000000..842131f
--- /dev/null
+++ b/mmcv/core/bbox/structures/utils.py
@@ -0,0 +1,214 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from logging import warning
+
+
+def limit_period(val, offset=0.5, period=np.pi):
+ """Limit the value into a period for periodic function.
+
+ Args:
+ val (torch.Tensor): The value to be converted.
+ offset (float, optional): Offset to set the value range. \
+ Defaults to 0.5.
+        period (float, optional): Period of the value. Defaults to np.pi.
+
+ Returns:
+ torch.Tensor: Value in the range of \
+ [-offset * period, (1-offset) * period]
+ """
+ return val - torch.floor(val / period + offset) * period
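+# Worked example (values assumed): with the default offset=0.5 and period=np.pi,
+# an input of 1.5 * pi is wrapped into [-0.5 * pi, 0.5 * pi):
+#   >>> limit_period(torch.tensor([1.5 * np.pi]))
+#   tensor([-1.5708])  # i.e. -0.5 * pi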
+
+
+def rotation_3d_in_axis(points, angles, axis=0):
+ """Rotate points by angles according to axis.
+
+ Args:
+ points (torch.Tensor): Points of shape (N, M, 3).
+ angles (torch.Tensor): Vector of angles in shape (N,)
+ axis (int, optional): The axis to be rotated. Defaults to 0.
+
+ Raises:
+        ValueError: If ``axis`` is not in [0, 1, 2].
+
+ Returns:
+ torch.Tensor: Rotated points in shape (N, M, 3)
+ """
+ rot_sin = torch.sin(angles)
+ rot_cos = torch.cos(angles)
+ ones = torch.ones_like(rot_cos)
+ zeros = torch.zeros_like(rot_cos)
+ if axis == 1:
+ rot_mat_T = torch.stack([
+ torch.stack([rot_cos, zeros, -rot_sin]),
+ torch.stack([zeros, ones, zeros]),
+ torch.stack([rot_sin, zeros, rot_cos])
+ ])
+ elif axis == 2 or axis == -1:
+ rot_mat_T = torch.stack([
+ torch.stack([rot_cos, -rot_sin, zeros]),
+ torch.stack([rot_sin, rot_cos, zeros]),
+ torch.stack([zeros, zeros, ones])
+ ])
+ elif axis == 0:
+ rot_mat_T = torch.stack([
+ torch.stack([zeros, rot_cos, -rot_sin]),
+ torch.stack([zeros, rot_sin, rot_cos]),
+ torch.stack([ones, zeros, zeros])
+ ])
+ else:
+        raise ValueError(f'axis should be in range [0, 1, 2], got {axis}')
+
+ return torch.einsum('aij,jka->aik', (points, rot_mat_T))
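+# Shape/orientation note (illustrative): for N boxes with M points each, the input
+# is (N, M, 3) points and (N,) angles, and the output is again (N, M, 3). With
+# axis=2 and an angle of pi / 2, the point (1, 0, 0) maps to (0, -1, 0) under the
+# ``points @ rot_mat_T`` convention used by the einsum above.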
+
+
+def xywhr2xyxyr(boxes_xywhr):
+ """Convert a rotated boxes in XYWHR format to XYXYR format.
+
+ Args:
+ boxes_xywhr (torch.Tensor): Rotated boxes in XYWHR format.
+
+ Returns:
+ torch.Tensor: Converted boxes in XYXYR format.
+ """
+ boxes = torch.zeros_like(boxes_xywhr)
+ half_w = boxes_xywhr[:, 2] / 2
+ half_h = boxes_xywhr[:, 3] / 2
+
+ boxes[:, 0] = boxes_xywhr[:, 0] - half_w
+ boxes[:, 1] = boxes_xywhr[:, 1] - half_h
+ boxes[:, 2] = boxes_xywhr[:, 0] + half_w
+ boxes[:, 3] = boxes_xywhr[:, 1] + half_h
+ boxes[:, 4] = boxes_xywhr[:, 4]
+ return boxes
+
+
+def get_box_type(box_type):
+ """Get the type and mode of box structure.
+
+ Args:
+ box_type (str): The type of box structure.
+            The valid values are "LiDAR", "Camera", or "Depth".
+
+ Returns:
+ tuple: Box type and box mode.
+ """
+ from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes,
+ DepthInstance3DBoxes, LiDARInstance3DBoxes)
+ box_type_lower = box_type.lower()
+ if box_type_lower == 'lidar':
+ box_type_3d = LiDARInstance3DBoxes
+ box_mode_3d = Box3DMode.LIDAR
+ elif box_type_lower == 'camera':
+ box_type_3d = CameraInstance3DBoxes
+ box_mode_3d = Box3DMode.CAM
+ elif box_type_lower == 'depth':
+ box_type_3d = DepthInstance3DBoxes
+ box_mode_3d = Box3DMode.DEPTH
+ else:
+ raise ValueError('Only "box_type" of "camera", "lidar", "depth"'
+ f' are supported, got {box_type}')
+
+ return box_type_3d, box_mode_3d
+
+
+def points_cam2img(points_3d, proj_mat, with_depth=False):
+ """Project points from camera coordicates to image coordinates.
+
+ Args:
+ points_3d (torch.Tensor): Points in shape (N, 3).
+ proj_mat (torch.Tensor): Transformation matrix between coordinates.
+ with_depth (bool, optional): Whether to keep depth in the output.
+ Defaults to False.
+
+ Returns:
+ torch.Tensor: Points in image coordinates with shape [N, 2].
+ """
+ points_num = list(points_3d.shape)[:-1]
+
+ points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
+ assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
+ f' matrix should be 2 instead of {len(proj_mat.shape)}.'
+ d1, d2 = proj_mat.shape[:2]
+ assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
+ d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
+ f' ({d1}*{d2}) is not supported.'
+ if d1 == 3:
+ proj_mat_expanded = torch.eye(
+ 4, device=proj_mat.device, dtype=proj_mat.dtype)
+ proj_mat_expanded[:d1, :d2] = proj_mat
+ proj_mat = proj_mat_expanded
+
+    # the previous implementation used new_zeros; new_ones yields better results
+ points_4 = torch.cat(
+ [points_3d, points_3d.new_ones(*points_shape)], dim=-1)
+ point_2d = torch.matmul(points_4, proj_mat.t())
+ point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
+
+ if with_depth:
+ return torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)
+ return point_2d_res
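+# Worked example (intrinsics assumed): with a pinhole matrix K = [[f, 0, cx],
+# [0, f, cy], [0, 0, 1]], a point on the optical axis at (0, 0, z) projects to
+# the principal point (cx, cy):
+#   >>> K = torch.tensor([[1000., 0., 320.], [0., 1000., 240.], [0., 0., 1.]])
+#   >>> points_cam2img(torch.tensor([[0., 0., 10.]]), K)
+#   tensor([[320., 240.]])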
+
+
+def mono_cam_box2vis(cam_box):
+ """This is a post-processing function on the bboxes from Mono-3D task. If
+ we want to perform projection visualization, we need to:
+
+ 1. rotate the box along x-axis for np.pi / 2 (roll)
+ 2. change orientation from local yaw to global yaw
+    3. convert yaw by (-yaw - np.pi / 2)
+
+ After applying this function, we can project and draw it on 2D images.
+
+ Args:
+ cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate \
+ system before conversion. Could be gt bbox loaded from dataset or \
+ network prediction output.
+
+ Returns:
+ :obj:`CameraInstance3DBoxes`: Box after conversion.
+ """
+    warning('DeprecationWarning: The hack of yaw and dimension in the '
+            'monocular 3D detection on nuScenes has been removed. The '
+            'function mono_cam_box2vis will be deprecated.')
+ from . import CameraInstance3DBoxes
+ assert isinstance(cam_box, CameraInstance3DBoxes), \
+ 'input bbox should be CameraInstance3DBoxes!'
+
+ loc = cam_box.gravity_center
+ dim = cam_box.dims
+ yaw = cam_box.yaw
+ feats = cam_box.tensor[:, 7:]
+ # rotate along x-axis for np.pi / 2
+ # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa
+ dim[:, [1, 2]] = dim[:, [2, 1]]
+ # change local yaw to global yaw for visualization
+ # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa
+ yaw += torch.atan2(loc[:, 0], loc[:, 2])
+ # convert yaw by (-yaw - np.pi / 2)
+ # this is because mono 3D box class such as `NuScenesBox` has different
+ # definition of rotation with our `CameraInstance3DBoxes`
+ yaw = -yaw - np.pi / 2
+ cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1)
+ cam_box = CameraInstance3DBoxes(
+ cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5))
+
+ return cam_box
+
+
+def get_proj_mat_by_coord_type(img_meta, coord_type):
+ """Obtain image features using points.
+
+ Args:
+ img_meta (dict): Meta info.
+ coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
+ Can be case-insensitive.
+
+ Returns:
+ torch.Tensor: transformation matrix.
+ """
+ coord_type = coord_type.upper()
+ mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'}
+ assert coord_type in mapping.keys()
+ return img_meta[mapping[coord_type]]
diff --git a/mmcv/core/bbox/transforms.py b/mmcv/core/bbox/transforms.py
new file mode 100644
index 0000000..2dcd769
--- /dev/null
+++ b/mmcv/core/bbox/transforms.py
@@ -0,0 +1,320 @@
+import numpy as np
+import torch
+
+
+def bbox_flip(bboxes, img_shape, direction='horizontal'):
+ """Flip bboxes horizontally or vertically.
+
+ Args:
+ bboxes (Tensor): Shape (..., 4*k)
+ img_shape (tuple): Image shape.
+ direction (str): Flip direction, options are "horizontal", "vertical",
+ "diagonal". Default: "horizontal"
+
+ Returns:
+ Tensor: Flipped bboxes.
+ """
+ assert bboxes.shape[-1] % 4 == 0
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ flipped = bboxes.clone()
+ if direction == 'horizontal':
+ flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+ flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+ elif direction == 'vertical':
+ flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+ flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+ else:
+ flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+ flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+ flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+ flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+ return flipped
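+# Worked example (values assumed): flipping the box (10, 20, 30, 40) horizontally
+# in an image of shape (100, 200) mirrors the x coordinates around the image width:
+#   >>> bbox_flip(torch.tensor([[10., 20., 30., 40.]]), (100, 200))
+#   tensor([[170.,  20., 190.,  40.]])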
+
+
+def bbox_mapping(bboxes,
+ img_shape,
+ scale_factor,
+ flip,
+ flip_direction='horizontal'):
+ """Map bboxes from the original image scale to testing scale."""
+ new_bboxes = bboxes * bboxes.new_tensor(scale_factor)
+ if flip:
+ new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction)
+ return new_bboxes
+
+
+def bbox_mapping_back(bboxes,
+ img_shape,
+ scale_factor,
+ flip,
+ flip_direction='horizontal'):
+ """Map bboxes from testing scale to original image scale."""
+ new_bboxes = bbox_flip(bboxes, img_shape,
+ flip_direction) if flip else bboxes
+ new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor)
+ return new_bboxes.view(bboxes.shape)
+
+
+def bbox2roi(bbox_list):
+ """Convert a list of bboxes to roi format.
+
+ Args:
+ bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+ of images.
+
+ Returns:
+ Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
+ """
+ rois_list = []
+ for img_id, bboxes in enumerate(bbox_list):
+ if bboxes.size(0) > 0:
+ img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+ rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
+ else:
+ rois = bboxes.new_zeros((0, 5))
+ rois_list.append(rois)
+ rois = torch.cat(rois_list, 0)
+ return rois
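+# Usage sketch (values assumed): two images with 2 and 1 detections produce a
+# (3, 5) RoI tensor whose first column is the image index within the batch:
+#   >>> rois = bbox2roi([torch.rand(2, 4), torch.rand(1, 4)])
+#   >>> rois[:, 0]
+#   tensor([0., 0., 1.])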
+
+
+def roi2bbox(rois):
+ """Convert rois to bounding box format.
+
+ Args:
+ rois (torch.Tensor): RoIs with the shape (n, 5) where the first
+ column indicates batch id of each RoI.
+
+ Returns:
+ list[torch.Tensor]: Converted boxes of corresponding rois.
+ """
+ bbox_list = []
+ img_ids = torch.unique(rois[:, 0].cpu(), sorted=True)
+ for img_id in img_ids:
+ inds = (rois[:, 0] == img_id.item())
+ bbox = rois[inds, 1:]
+ bbox_list.append(bbox)
+ return bbox_list
+
+
+def bbox2result(bboxes, labels, num_classes):
+ """Convert detection results to a list of numpy arrays.
+
+ Args:
+ bboxes (torch.Tensor | np.ndarray): shape (n, 5)
+ labels (torch.Tensor | np.ndarray): shape (n, )
+ num_classes (int): class number, including background class
+
+ Returns:
+ list(ndarray): bbox results of each class
+ """
+ if bboxes.shape[0] == 0:
+ return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)]
+ else:
+ if isinstance(bboxes, torch.Tensor):
+ bboxes = bboxes.detach().cpu().numpy()
+ labels = labels.detach().cpu().numpy()
+ return [bboxes[labels == i, :] for i in range(num_classes)]
+
+
+def distance2bbox(points, distance, max_shape=None):
+ """Decode distance prediction to bounding box.
+
+ Args:
+ points (Tensor): Shape (B, N, 2) or (N, 2).
+ distance (Tensor): Distance from the given point to 4
+ boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4)
+ max_shape (Sequence[int] or torch.Tensor or Sequence[
+ Sequence[int]],optional): Maximum bounds for boxes, specifies
+ (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+ the max_shape should be a Sequence[Sequence[int]]
+ and the length of max_shape should also be B.
+
+ Returns:
+ Tensor: Boxes with shape (N, 4) or (B, N, 4)
+ """
+ x1 = points[..., 0] - distance[..., 0]
+ y1 = points[..., 1] - distance[..., 1]
+ x2 = points[..., 0] + distance[..., 2]
+ y2 = points[..., 1] + distance[..., 3]
+
+ bboxes = torch.stack([x1, y1, x2, y2], -1)
+
+ if max_shape is not None:
+ # clip bboxes with dynamic `min` and `max` for onnx
+ if torch.onnx.is_in_onnx_export():
+ from mmcv.core.export import dynamic_clip_for_onnx
+ x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+ bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+ return bboxes
+ if not isinstance(max_shape, torch.Tensor):
+ max_shape = x1.new_tensor(max_shape)
+ max_shape = max_shape[..., :2].type_as(x1)
+ if max_shape.ndim == 2:
+ assert bboxes.ndim == 3
+ assert max_shape.size(0) == bboxes.size(0)
+
+ min_xy = x1.new_tensor(0)
+ max_xy = torch.cat([max_shape, max_shape],
+ dim=-1).flip(-1).unsqueeze(-2)
+ bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+ bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+ return bboxes
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+ """Decode bounding box based on distances.
+
+ Args:
+ points (Tensor): Shape (n, 2), [x, y].
+ bbox (Tensor): Shape (n, 4), "xyxy" format
+ max_dis (float): Upper bound of the distance.
+ eps (float): a small value to ensure target < max_dis, instead <=
+
+ Returns:
+ Tensor: Decoded distances.
+ """
+ left = points[:, 0] - bbox[:, 0]
+ top = points[:, 1] - bbox[:, 1]
+ right = bbox[:, 2] - points[:, 0]
+ bottom = bbox[:, 3] - points[:, 1]
+ if max_dis is not None:
+ left = left.clamp(min=0, max=max_dis - eps)
+ top = top.clamp(min=0, max=max_dis - eps)
+ right = right.clamp(min=0, max=max_dis - eps)
+ bottom = bottom.clamp(min=0, max=max_dis - eps)
+ return torch.stack([left, top, right, bottom], -1)
+
+
+def bbox_rescale(bboxes, scale_factor=1.0):
+ """Rescale bounding box w.r.t. scale_factor.
+
+ Args:
+ bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois
+ scale_factor (float): rescale factor
+
+ Returns:
+ Tensor: Rescaled bboxes.
+ """
+ if bboxes.size(1) == 5:
+ bboxes_ = bboxes[:, 1:]
+ inds_ = bboxes[:, 0]
+ else:
+ bboxes_ = bboxes
+ cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5
+ cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5
+ w = bboxes_[:, 2] - bboxes_[:, 0]
+ h = bboxes_[:, 3] - bboxes_[:, 1]
+ w = w * scale_factor
+ h = h * scale_factor
+ x1 = cx - 0.5 * w
+ x2 = cx + 0.5 * w
+ y1 = cy - 0.5 * h
+ y2 = cy + 0.5 * h
+ if bboxes.size(1) == 5:
+ rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1)
+ else:
+ rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+ return rescaled_bboxes
+
+
+def bbox_cxcywh_to_xyxy(bbox):
+ """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2).
+
+ Args:
+ bbox (Tensor): Shape (n, 4) for bboxes.
+
+ Returns:
+ Tensor: Converted bboxes.
+ """
+ cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
+ bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)]
+ return torch.cat(bbox_new, dim=-1)
+
+
+def bbox_xyxy_to_cxcywh(bbox):
+ """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h).
+
+ Args:
+ bbox (Tensor): Shape (n, 4) for bboxes.
+
+ Returns:
+ Tensor: Converted bboxes.
+ """
+ x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1)
+ bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)]
+ return torch.cat(bbox_new, dim=-1)
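+# Worked example (values assumed): the two conversions are inverses of each other:
+#   >>> bbox_xyxy_to_cxcywh(torch.tensor([[0., 0., 10., 20.]]))
+#   tensor([[ 5., 10., 10., 20.]])
+#   >>> bbox_cxcywh_to_xyxy(torch.tensor([[5., 10., 10., 20.]]))
+#   tensor([[ 0.,  0., 10., 20.]])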
+
+def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical):
+ """Map bboxes from testing scale to original image scale.
+
+ Args:
+ bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
+ scale_factor (float): Scale factor.
+ flip_horizontal (bool): Whether to flip horizontally.
+ flip_vertical (bool): Whether to flip vertically.
+
+ Returns:
+ :obj:`BaseInstance3DBoxes`: Boxes mapped back.
+ """
+ new_bboxes = bboxes.clone()
+ if flip_horizontal:
+ new_bboxes.flip('horizontal')
+ if flip_vertical:
+ new_bboxes.flip('vertical')
+ new_bboxes.scale(1 / scale_factor)
+
+ return new_bboxes
+
+
+def bbox3d2roi(bbox_list):
+ """Convert a list of bounding boxes to roi format.
+
+ Args:
+ bbox_list (list[torch.Tensor]): A list of bounding boxes
+ corresponding to a batch of images.
+
+ Returns:
+ torch.Tensor: Region of interests in shape (n, c), where \
+ the channels are in order of [batch_ind, x, y ...].
+ """
+ rois_list = []
+ for img_id, bboxes in enumerate(bbox_list):
+ if bboxes.size(0) > 0:
+ img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+ rois = torch.cat([img_inds, bboxes], dim=-1)
+ else:
+ rois = torch.zeros_like(bboxes)
+ rois_list.append(rois)
+ rois = torch.cat(rois_list, 0)
+ return rois
+
+
+def bbox3d2result(bboxes, scores, labels, attrs=None):
+ """Convert detection results to a list of numpy arrays.
+
+ Args:
+        bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
+        scores (torch.Tensor): Scores with shape of (n, ).
+        labels (torch.Tensor): Labels with shape of (n, ).
+ attrs (torch.Tensor, optional): Attributes with shape of (n, ). \
+ Defaults to None.
+
+ Returns:
+ dict[str, torch.Tensor]: Bounding box results in cpu mode.
+
+ - boxes_3d (torch.Tensor): 3D boxes.
+ - scores (torch.Tensor): Prediction scores.
+ - labels_3d (torch.Tensor): Box labels.
+ - attrs_3d (torch.Tensor, optional): Box attributes.
+ """
+ result_dict = dict(
+ boxes_3d=bboxes.to('cpu'),
+ scores_3d=scores.cpu(),
+ labels_3d=labels.cpu())
+
+ if attrs is not None:
+ result_dict['attrs_3d'] = attrs.cpu()
+
+ return result_dict
+
diff --git a/mmcv/core/bbox/util.py b/mmcv/core/bbox/util.py
new file mode 100755
index 0000000..c54bd75
--- /dev/null
+++ b/mmcv/core/bbox/util.py
@@ -0,0 +1,53 @@
+import torch
+
+
+def normalize_bbox(bboxes, pc_range):
+
+ cx = bboxes[..., 0:1]
+ cy = bboxes[..., 1:2]
+ cz = bboxes[..., 2:3]
+ w = bboxes[..., 3:4].log()
+ l = bboxes[..., 4:5].log()
+ h = bboxes[..., 5:6].log()
+
+ rot = bboxes[..., 6:7]
+ if bboxes.size(-1) > 7:
+ vx = bboxes[..., 7:8]
+ vy = bboxes[..., 8:9]
+ normalized_bboxes = torch.cat(
+ (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1
+ )
+ else:
+ normalized_bboxes = torch.cat(
+ (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1
+ )
+ return normalized_bboxes
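+# Layout note (illustrative): the normalized encoding is
+# (cx, cy, log w, log l, cz, log h, sin(rot), cos(rot)[, vx, vy]);
+# ``denormalize_bbox`` below inverts it back to (cx, cy, cz, w, l, h, rot[, vx, vy]).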
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+ # rotation
+ rot_sine = normalized_bboxes[..., 6:7]
+
+ rot_cosine = normalized_bboxes[..., 7:8]
+ rot = torch.atan2(rot_sine, rot_cosine)
+
+ # center in the bev
+ cx = normalized_bboxes[..., 0:1]
+ cy = normalized_bboxes[..., 1:2]
+ cz = normalized_bboxes[..., 4:5]
+
+ # size
+ w = normalized_bboxes[..., 2:3]
+ l = normalized_bboxes[..., 3:4]
+ h = normalized_bboxes[..., 5:6]
+
+ w = w.exp()
+ l = l.exp()
+ h = h.exp()
+ if normalized_bboxes.size(-1) > 8:
+ # velocity
+ vx = normalized_bboxes[:, 8:9]
+ vy = normalized_bboxes[:, 9:10]
+ denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
+ else:
+ denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)
+ return denormalized_bboxes
\ No newline at end of file
diff --git a/mmcv/core/evaluation/__init__.py b/mmcv/core/evaluation/__init__.py
new file mode 100644
index 0000000..b93b087
--- /dev/null
+++ b/mmcv/core/evaluation/__init__.py
@@ -0,0 +1,13 @@
+from .indoor_eval import indoor_eval
+from .kitti_utils import kitti_eval, kitti_eval_coco_style
+from .lyft_eval import lyft_eval
+from .seg_eval import seg_eval
+from .class_names import (cityscapes_classes, coco_classes, dataset_aliases,
+ get_classes, get_palette, imagenet_det_classes,
+ imagenet_vid_classes, voc_classes)
+from .eval_hooks import DistEvalHook, EvalHook, CustomDistEvalHook
+from .mean_ap import average_precision, eval_map, print_map_summary
+from .recall import (eval_recalls, plot_iou_recall, plot_num_recall,
+ print_recall_summary)
+from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou
+from .metric_motion import get_ade, get_best_preds, get_fde
\ No newline at end of file
diff --git a/mmcv/core/evaluation/bbox_overlaps.py b/mmcv/core/evaluation/bbox_overlaps.py
new file mode 100644
index 0000000..93559ea
--- /dev/null
+++ b/mmcv/core/evaluation/bbox_overlaps.py
@@ -0,0 +1,48 @@
+import numpy as np
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6):
+ """Calculate the ious between each bbox of bboxes1 and bboxes2.
+
+ Args:
+        bboxes1 (ndarray): shape (n, 4)
+        bboxes2 (ndarray): shape (k, 4)
+        mode (str): iou (intersection over union) or iof (intersection
+            over foreground)
+        eps (float): small constant added to the union to avoid division by zero
+
+    Returns:
+        ious (ndarray): shape (n, k)
+ """
+
+ assert mode in ['iou', 'iof']
+
+ bboxes1 = bboxes1.astype(np.float32)
+ bboxes2 = bboxes2.astype(np.float32)
+ rows = bboxes1.shape[0]
+ cols = bboxes2.shape[0]
+ ious = np.zeros((rows, cols), dtype=np.float32)
+ if rows * cols == 0:
+ return ious
+ exchange = False
+ if bboxes1.shape[0] > bboxes2.shape[0]:
+ bboxes1, bboxes2 = bboxes2, bboxes1
+ ious = np.zeros((cols, rows), dtype=np.float32)
+ exchange = True
+ area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
+ area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
+ for i in range(bboxes1.shape[0]):
+ x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
+ y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
+ x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
+ y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
+ overlap = np.maximum(x_end - x_start, 0) * np.maximum(
+ y_end - y_start, 0)
+ if mode == 'iou':
+ union = area1[i] + area2 - overlap
+ else:
+ union = area1[i] if not exchange else area2
+ union = np.maximum(union, eps)
+ ious[i, :] = overlap / union
+ if exchange:
+ ious = ious.T
+ return ious
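+# Worked example (values assumed): two axis-aligned 10x10 boxes offset by (5, 5)
+# overlap on a 5x5 patch, giving IoU = 25 / 175, about 0.143:
+#   >>> a = np.array([[0., 0., 10., 10.]])
+#   >>> b = np.array([[5., 5., 15., 15.]])
+#   >>> bbox_overlaps(a, b)
+#   array([[0.14285715]], dtype=float32)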
diff --git a/mmcv/core/evaluation/class_names.py b/mmcv/core/evaluation/class_names.py
new file mode 100644
index 0000000..0e0e4f2
--- /dev/null
+++ b/mmcv/core/evaluation/class_names.py
@@ -0,0 +1,219 @@
+from mmcv.utils import is_str
+
+def ade_classes():
+ """ADE20K class names for external use."""
+ return [
+ 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ',
+ 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth',
+ 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car',
+ 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug',
+ 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe',
+ 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column',
+ 'signboard', 'chest of drawers', 'counter', 'sand', 'sink',
+ 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path',
+ 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door',
+ 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table',
+ 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove',
+ 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar',
+ 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower',
+ 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver',
+ 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister',
+ 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van',
+ 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything',
+ 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent',
+ 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank',
+ 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake',
+ 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce',
+ 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen',
+ 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass',
+ 'clock', 'flag'
+ ]
+
+def ade_palette():
+ """ADE20K palette for external use."""
+ return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+ [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+ [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+ [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+ [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+ [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+ [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+ [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+ [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+ [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+ [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+ [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+ [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+ [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+ [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+ [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+ [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+ [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+ [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+ [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+ [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+ [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+ [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+ [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+ [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+ [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+ [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+ [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+ [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+ [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+ [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+ [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+ [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+ [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+ [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+ [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+ [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+ [102, 255, 0], [92, 0, 255]]
+
+def wider_face_classes():
+ return ['face']
+
+
+def voc_classes():
+ return [
+ 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+ 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+ 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
+ ]
+
+def voc_palette():
+ """Pascal VOC palette for external use."""
+ return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128],
+ [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0],
+ [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128],
+ [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0],
+ [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]
+
+
+def imagenet_det_classes():
+ return [
+ 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo',
+ 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam',
+ 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap',
+ 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder',
+ 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito',
+ 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle',
+ 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker',
+ 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew',
+ 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper',
+ 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly',
+ 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig',
+ 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog',
+ 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart',
+ 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger',
+ 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim',
+ 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse',
+ 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle',
+ 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard',
+ 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can',
+ 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace',
+ 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume',
+ 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza',
+ 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine',
+ 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse',
+ 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
+ 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
+ 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
+ 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
+ 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
+ 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
+ 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
+ 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
+ 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
+ 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
+ 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
+ 'whale', 'wine_bottle', 'zebra'
+ ]
+
+
+def imagenet_vid_classes():
+ return [
+ 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
+ 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
+ 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
+ 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
+ 'watercraft', 'whale', 'zebra'
+ ]
+
+
+def coco_classes():
+ return [
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+ 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign',
+ 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+ 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+ 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+ 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard',
+ 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork',
+ 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+ 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair',
+ 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv',
+ 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
+ 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+ 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush'
+ ]
+
+
+def cityscapes_classes():
+ return [
+ 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
+ 'bicycle'
+ ]
+
+def cityscapes_palette():
+ """Cityscapes palette for external use."""
+ return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156],
+ [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0],
+ [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60],
+ [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100],
+ [0, 0, 230], [119, 11, 32]]
+
+
+dataset_aliases = {
+ 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
+ 'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
+ 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
+ 'coco': ['coco', 'mscoco', 'ms_coco'],
+ 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'],
+ 'cityscapes': ['cityscapes'],
+ 'ade': ['ade', 'ade20k']
+}
+
+
+def get_classes(dataset):
+ """Get class names of a dataset."""
+ alias2name = {}
+ for name, aliases in dataset_aliases.items():
+ for alias in aliases:
+ alias2name[alias] = name
+
+ if is_str(dataset):
+ if dataset in alias2name:
+ labels = eval(alias2name[dataset] + '_classes()')
+ else:
+ raise ValueError(f'Unrecognized dataset: {dataset}')
+ else:
+        raise TypeError(f'dataset must be a str, but got {type(dataset)}')
+ return labels
+
+
+def get_palette(dataset):
+ """Get class palette (RGB) of a dataset."""
+ alias2name = {}
+ for name, aliases in dataset_aliases.items():
+ for alias in aliases:
+ alias2name[alias] = name
+
+ if is_str(dataset):
+ if dataset in alias2name:
+ labels = eval(alias2name[dataset] + '_palette()')
+ else:
+ raise ValueError(f'Unrecognized dataset: {dataset}')
+ else:
+        raise TypeError(f'dataset must be a str, but got {type(dataset)}')
+ return labels
diff --git a/mmcv/core/evaluation/eval_hooks.py b/mmcv/core/evaluation/eval_hooks.py
new file mode 100644
index 0000000..dcaf73d
--- /dev/null
+++ b/mmcv/core/evaluation/eval_hooks.py
@@ -0,0 +1,133 @@
+import bisect
+import os.path as osp
+
+import torch.distributed as dist
+from mmcv.runner import DistEvalHook as BaseDistEvalHook
+from mmcv.runner import EvalHook as BaseEvalHook
+from mmcv.utils import is_list_of
+from torch.nn.modules.batchnorm import _BatchNorm
+
+
+class EvalHook(BaseEvalHook):
+
+ def _do_evaluate(self, runner):
+        """Perform evaluation and save checkpoint."""
+ if not self._should_evaluate(runner):
+ return
+
+ results = self.test_fn(runner.model, self.dataloader, show=False)
+ runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+ key_score = self.evaluate(runner, results)
+ if self.save_best:
+ self._save_ckpt(runner, key_score)
+
+
+class DistEvalHook(BaseDistEvalHook):
+
+ def _do_evaluate(self, runner):
+        """Perform evaluation and save checkpoint."""
+ # Synchronization of BatchNorm's buffer (running_mean
+ # and running_var) is not supported in the DDP of pytorch,
+ # which may cause the inconsistent performance of models in
+ # different ranks, so we broadcast BatchNorm's buffers
+ # of rank 0 to other ranks to avoid this.
+ if self.broadcast_bn_buffer:
+ model = runner.model
+ for name, module in model.named_modules():
+ if isinstance(module,
+ _BatchNorm) and module.track_running_stats:
+ dist.broadcast(module.running_var, 0)
+ dist.broadcast(module.running_mean, 0)
+
+ if not self._should_evaluate(runner):
+ return
+
+ tmpdir = self.tmpdir
+ if tmpdir is None:
+ tmpdir = osp.join(runner.work_dir, '.eval_hook')
+
+ results = self.test_fn(
+ runner.model,
+ self.dataloader,
+ tmpdir=tmpdir,
+ gpu_collect=self.gpu_collect)
+ if runner.rank == 0:
+ print('\n')
+ runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+ key_score = self.evaluate(runner, results)
+
+ if self.save_best:
+ self._save_ckpt(runner, key_score)
+
+
+def _calc_dynamic_intervals(start_interval, dynamic_interval_list):
+ assert is_list_of(dynamic_interval_list, tuple)
+
+ dynamic_milestones = [0]
+ dynamic_milestones.extend(
+ [dynamic_interval[0] for dynamic_interval in dynamic_interval_list])
+ dynamic_intervals = [start_interval]
+ dynamic_intervals.extend(
+ [dynamic_interval[1] for dynamic_interval in dynamic_interval_list])
+ return dynamic_milestones, dynamic_intervals
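+
+# Worked example of the helper above (hypothetical values): with
+# start_interval=1 and dynamic_interval_list=[(8, 2), (11, 1)], it returns
+# milestones [0, 8, 11] and intervals [1, 2, 1]. In _decide_interval below,
+# the interval stays 1 until `progress + 1` reaches 8, becomes 2 until it
+# reaches 11, and then becomes 1 again.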
+
+
+class CustomDistEvalHook(BaseDistEvalHook):
+
+ def __init__(self, *args, dynamic_intervals=None, **kwargs):
+ super(CustomDistEvalHook, self).__init__(*args, **kwargs)
+ self.use_dynamic_intervals = dynamic_intervals is not None
+ if self.use_dynamic_intervals:
+ self.dynamic_milestones, self.dynamic_intervals = \
+ _calc_dynamic_intervals(self.interval, dynamic_intervals)
+
+ def _decide_interval(self, runner):
+ if self.use_dynamic_intervals:
+ progress = runner.epoch if self.by_epoch else runner.iter
+ step = bisect.bisect(self.dynamic_milestones, (progress + 1))
+ # Dynamically modify the evaluation interval
+ self.interval = self.dynamic_intervals[step - 1]
+
+ def before_train_epoch(self, runner):
+ """Evaluate the model only at the start of training by epoch."""
+ self._decide_interval(runner)
+ super().before_train_epoch(runner)
+
+ def before_train_iter(self, runner):
+ self._decide_interval(runner)
+ super().before_train_iter(runner)
+
+ def _do_evaluate(self, runner):
+        """Perform evaluation and save checkpoint."""
+ # Synchronization of BatchNorm's buffer (running_mean
+ # and running_var) is not supported in the DDP of pytorch,
+ # which may cause the inconsistent performance of models in
+ # different ranks, so we broadcast BatchNorm's buffers
+ # of rank 0 to other ranks to avoid this.
+ if self.broadcast_bn_buffer:
+ model = runner.model
+ for name, module in model.named_modules():
+ if isinstance(module,
+ _BatchNorm) and module.track_running_stats:
+ dist.broadcast(module.running_var, 0)
+ dist.broadcast(module.running_mean, 0)
+
+ if not self._should_evaluate(runner):
+ return
+
+ tmpdir = self.tmpdir
+ if tmpdir is None:
+ tmpdir = osp.join(runner.work_dir, '.eval_hook')
+
+ results = self.test_fn(
+ runner.model,
+ self.dataloader,
+ tmpdir=tmpdir,
+ gpu_collect=self.gpu_collect)
+ if runner.rank == 0:
+ print('\n')
+ runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+
+ key_score = self.evaluate(runner, results)
+
+ if self.save_best:
+ self._save_ckpt(runner, key_score)
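+
+# Hypothetical instantiation sketch (the dataloader and runner setup live
+# outside this file and are assumptions here):
+#   hook = CustomDistEvalHook(val_dataloader, interval=1,
+#                             dynamic_intervals=[(8, 2)])
+#   runner.register_hook(hook)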
diff --git a/mmcv/core/evaluation/indoor_eval.py b/mmcv/core/evaluation/indoor_eval.py
new file mode 100644
index 0000000..ff0dac1
--- /dev/null
+++ b/mmcv/core/evaluation/indoor_eval.py
@@ -0,0 +1,310 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+
+def average_precision(recalls, precisions, mode='area'):
+ """Calculate average precision (for single or multiple scales).
+
+ Args:
+ recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) \
+ or (num_dets, ).
+ precisions (np.ndarray): Precisions with shape of \
+ (num_scales, num_dets) or (num_dets, ).
+ mode (str): 'area' or '11points', 'area' means calculating the area
+ under precision-recall curve, '11points' means calculating
+ the average precision of recalls at [0, 0.1, ..., 1]
+
+ Returns:
+ float or np.ndarray: Calculated average precision.
+ """
+ if recalls.ndim == 1:
+ recalls = recalls[np.newaxis, :]
+ precisions = precisions[np.newaxis, :]
+
+ assert recalls.shape == precisions.shape
+ assert recalls.ndim == 2
+
+ num_scales = recalls.shape[0]
+ ap = np.zeros(num_scales, dtype=np.float32)
+ if mode == 'area':
+ zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+ ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+ mrec = np.hstack((zeros, recalls, ones))
+ mpre = np.hstack((zeros, precisions, zeros))
+ for i in range(mpre.shape[1] - 1, 0, -1):
+ mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+ for i in range(num_scales):
+ ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+ ap[i] = np.sum(
+ (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+ elif mode == '11points':
+ for i in range(num_scales):
+ for thr in np.arange(0, 1 + 1e-3, 0.1):
+ precs = precisions[i, recalls[i, :] >= thr]
+ prec = precs.max() if precs.size > 0 else 0
+ ap[i] += prec
+ ap /= 11
+ else:
+ raise ValueError(
+ 'Unrecognized mode, only "area" and "11points" are supported')
+ return ap
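+
+# Minimal numeric sketch (synthetic recall/precision values):
+#   recalls = np.array([0.25, 0.5, 0.75, 1.0])
+#   precisions = np.array([1.0, 0.8, 0.6, 0.5])
+#   average_precision(recalls, precisions, mode='area')
+#   # -> approximately array([0.725]), the area under the interpolated PR curve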
+
+
+def eval_det_cls(pred, gt, iou_thr=None):
+    """Generic function to compute precision/recall for object detection for a
+    single class.
+
+ Args:
+ pred (dict): Predictions mapping from image id to bounding boxes \
+ and scores.
+ gt (dict): Ground truths mapping from image id to bounding boxes.
+ iou_thr (list[float]): A list of iou thresholds.
+
+ Return:
+ tuple (np.ndarray, np.ndarray, float): Recalls, precisions and \
+ average precision.
+ """
+
+ # {img_id: {'bbox': box structure, 'det': matched list}}
+ class_recs = {}
+ npos = 0
+ for img_id in gt.keys():
+ cur_gt_num = len(gt[img_id])
+ if cur_gt_num != 0:
+ gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32)
+ for i in range(cur_gt_num):
+ gt_cur[i] = gt[img_id][i].tensor
+ bbox = gt[img_id][0].new_box(gt_cur)
+ else:
+ bbox = gt[img_id]
+ det = [[False] * len(bbox) for i in iou_thr]
+ npos += len(bbox)
+ class_recs[img_id] = {'bbox': bbox, 'det': det}
+
+ # construct dets
+ image_ids = []
+ confidence = []
+ ious = []
+ for img_id in pred.keys():
+ cur_num = len(pred[img_id])
+ if cur_num == 0:
+ continue
+ pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32)
+ box_idx = 0
+ for box, score in pred[img_id]:
+ image_ids.append(img_id)
+ confidence.append(score)
+ pred_cur[box_idx] = box.tensor
+ box_idx += 1
+ pred_cur = box.new_box(pred_cur)
+ gt_cur = class_recs[img_id]['bbox']
+ if len(gt_cur) > 0:
+ # calculate iou in each image
+ iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
+ for i in range(cur_num):
+ ious.append(iou_cur[i])
+ else:
+ for i in range(cur_num):
+ ious.append(np.zeros(1))
+
+ confidence = np.array(confidence)
+
+ # sort by confidence
+ sorted_ind = np.argsort(-confidence)
+ image_ids = [image_ids[x] for x in sorted_ind]
+ ious = [ious[x] for x in sorted_ind]
+
+ # go down dets and mark TPs and FPs
+ nd = len(image_ids)
+ tp_thr = [np.zeros(nd) for i in iou_thr]
+ fp_thr = [np.zeros(nd) for i in iou_thr]
+ for d in range(nd):
+ R = class_recs[image_ids[d]]
+ iou_max = -np.inf
+ BBGT = R['bbox']
+ cur_iou = ious[d]
+
+ if len(BBGT) > 0:
+ # compute overlaps
+ for j in range(len(BBGT)):
+ # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
+ iou = cur_iou[j]
+ if iou > iou_max:
+ iou_max = iou
+ jmax = j
+
+ for iou_idx, thresh in enumerate(iou_thr):
+ if iou_max > thresh:
+ if not R['det'][iou_idx][jmax]:
+ tp_thr[iou_idx][d] = 1.
+ R['det'][iou_idx][jmax] = 1
+ else:
+ fp_thr[iou_idx][d] = 1.
+ else:
+ fp_thr[iou_idx][d] = 1.
+
+ ret = []
+ for iou_idx, thresh in enumerate(iou_thr):
+ # compute precision recall
+ fp = np.cumsum(fp_thr[iou_idx])
+ tp = np.cumsum(tp_thr[iou_idx])
+ recall = tp / float(npos)
+ # avoid divide by zero in case the first detection matches a difficult
+ # ground truth
+ precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+ ap = average_precision(recall, precision)
+ ret.append((recall, precision, ap))
+
+ return ret
+
+
+def eval_map_recall(pred, gt, ovthresh=None):
+ """Evaluate mAP and recall.
+
+ Generic functions to compute precision/recall for object detection
+ for multiple classes.
+
+ Args:
+ pred (dict): Information of detection results,
+ which maps class_id and predictions.
+ gt (dict): Information of ground truths, which maps class_id and \
+ ground truths.
+        ovthresh (list[float]): IoU thresholds.
+ Default: None.
+
+ Return:
+ tuple[dict]: dict results of recall, AP, and precision for all classes.
+ """
+
+ ret_values = {}
+ for classname in gt.keys():
+ if classname in pred:
+ ret_values[classname] = eval_det_cls(pred[classname],
+ gt[classname], ovthresh)
+ recall = [{} for i in ovthresh]
+ precision = [{} for i in ovthresh]
+ ap = [{} for i in ovthresh]
+
+ for label in gt.keys():
+ for iou_idx, thresh in enumerate(ovthresh):
+ if label in pred:
+ recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
+ label] = ret_values[label][iou_idx]
+ else:
+ recall[iou_idx][label] = np.zeros(1)
+ precision[iou_idx][label] = np.zeros(1)
+ ap[iou_idx][label] = np.zeros(1)
+
+ return recall, precision, ap
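+
+# Note on the return structure: recall, precision and ap are each a list with
+# one dict per threshold in `ovthresh`, keyed by class label. For example,
+# ap[0][label] is the AP of class `label` at ovthresh[0], and
+# recall[i][label][-1] is the final recall used by indoor_eval below.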
+
+
+def indoor_eval(gt_annos,
+ dt_annos,
+ metric,
+ label2cat,
+ logger=None,
+ box_type_3d=None,
+ box_mode_3d=None):
+ """Indoor Evaluation.
+
+    Evaluate the detection results.
+
+ Args:
+ gt_annos (list[dict]): Ground truth annotations.
+ dt_annos (list[dict]): Detection annotations. the dict
+        dt_annos (list[dict]): Detection annotations. The dict
+            includes the following keys:
+ - labels_3d (torch.Tensor): Labels of boxes.
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): \
+ 3D bounding boxes in Depth coordinate.
+ - scores_3d (torch.Tensor): Scores of boxes.
+ metric (list[float]): IoU thresholds for computing average precisions.
+ label2cat (dict): Map from label to category.
+ logger (logging.Logger | str | None): The way to print the mAP
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+
+ Return:
+ dict[str, float]: Dict of results.
+ """
+ assert len(dt_annos) == len(gt_annos)
+ pred = {} # map {class_id: pred}
+ gt = {} # map {class_id: gt}
+ for img_id in range(len(dt_annos)):
+ # parse detected annotations
+ det_anno = dt_annos[img_id]
+ for i in range(len(det_anno['labels_3d'])):
+ label = det_anno['labels_3d'].numpy()[i]
+ bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i]
+ score = det_anno['scores_3d'].numpy()[i]
+ if label not in pred:
+ pred[int(label)] = {}
+ if img_id not in pred[label]:
+ pred[int(label)][img_id] = []
+ if label not in gt:
+ gt[int(label)] = {}
+ if img_id not in gt[label]:
+ gt[int(label)][img_id] = []
+ pred[int(label)][img_id].append((bbox, score))
+
+ # parse gt annotations
+ gt_anno = gt_annos[img_id]
+ if gt_anno['gt_num'] != 0:
+ gt_boxes = box_type_3d(
+ gt_anno['gt_boxes_upright_depth'],
+ box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d)
+ labels_3d = gt_anno['class']
+ else:
+ gt_boxes = box_type_3d(np.array([], dtype=np.float32))
+ labels_3d = np.array([], dtype=np.int64)
+
+ for i in range(len(labels_3d)):
+ label = labels_3d[i]
+ bbox = gt_boxes[i]
+ if label not in gt:
+ gt[label] = {}
+ if img_id not in gt[label]:
+ gt[label][img_id] = []
+ gt[label][img_id].append(bbox)
+
+ rec, prec, ap = eval_map_recall(pred, gt, metric)
+ ret_dict = dict()
+ header = ['classes']
+ table_columns = [[label2cat[label]
+ for label in ap[0].keys()] + ['Overall']]
+
+ for i, iou_thresh in enumerate(metric):
+ header.append(f'AP_{iou_thresh:.2f}')
+ header.append(f'AR_{iou_thresh:.2f}')
+ rec_list = []
+ for label in ap[i].keys():
+ ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
+ ap[i][label][0])
+ ret_dict[f'mAP_{iou_thresh:.2f}'] = float(
+ np.mean(list(ap[i].values())))
+
+ table_columns.append(list(map(float, list(ap[i].values()))))
+ table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
+ table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
+
+ for label in rec[i].keys():
+ ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
+ rec[i][label][-1])
+ rec_list.append(rec[i][label][-1])
+ ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
+
+ table_columns.append(list(map(float, rec_list)))
+ table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
+ table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
+
+ table_data = [header]
+ table_rows = list(zip(*table_columns))
+ table_data += table_rows
+ table = AsciiTable(table_data)
+ table.inner_footing_row_border = True
+ print_log('\n' + table.table, logger=logger)
+
+ return ret_dict
diff --git a/mmcv/core/evaluation/kitti_utils/__init__.py b/mmcv/core/evaluation/kitti_utils/__init__.py
new file mode 100644
index 0000000..23c1cdf
--- /dev/null
+++ b/mmcv/core/evaluation/kitti_utils/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .eval import kitti_eval, kitti_eval_coco_style
+
+__all__ = ['kitti_eval', 'kitti_eval_coco_style']
diff --git a/mmcv/core/evaluation/kitti_utils/eval.py b/mmcv/core/evaluation/kitti_utils/eval.py
new file mode 100644
index 0000000..93492c4
--- /dev/null
+++ b/mmcv/core/evaluation/kitti_utils/eval.py
@@ -0,0 +1,847 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import gc
+import io as sysio
+import numba
+import numpy as np
+
+
+@numba.jit
+def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41):
+ scores.sort()
+ scores = scores[::-1]
+ current_recall = 0
+ thresholds = []
+ for i, score in enumerate(scores):
+ l_recall = (i + 1) / num_gt
+ if i < (len(scores) - 1):
+ r_recall = (i + 2) / num_gt
+ else:
+ r_recall = l_recall
+ if (((r_recall - current_recall) < (current_recall - l_recall))
+ and (i < (len(scores) - 1))):
+ continue
+ # recall = l_recall
+ thresholds.append(score)
+ current_recall += 1 / (num_sample_pts - 1.0)
+ return thresholds
+
+
+def clean_data(gt_anno, dt_anno, current_class, difficulty):
+ CLASS_NAMES = ['car', 'pedestrian', 'cyclist']
+ MIN_HEIGHT = [40, 25, 25]
+ MAX_OCCLUSION = [0, 1, 2]
+ MAX_TRUNCATION = [0.15, 0.3, 0.5]
+ dc_bboxes, ignored_gt, ignored_dt = [], [], []
+ current_cls_name = CLASS_NAMES[current_class].lower()
+ num_gt = len(gt_anno['name'])
+ num_dt = len(dt_anno['name'])
+ num_valid_gt = 0
+ for i in range(num_gt):
+ bbox = gt_anno['bbox'][i]
+ gt_name = gt_anno['name'][i].lower()
+ height = bbox[3] - bbox[1]
+ valid_class = -1
+ if (gt_name == current_cls_name):
+ valid_class = 1
+ elif (current_cls_name == 'Pedestrian'.lower()
+ and 'Person_sitting'.lower() == gt_name):
+ valid_class = 0
+ elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name):
+ valid_class = 0
+ else:
+ valid_class = -1
+ ignore = False
+ if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty])
+ or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty])
+ or (height <= MIN_HEIGHT[difficulty])):
+ ignore = True
+ if valid_class == 1 and not ignore:
+ ignored_gt.append(0)
+ num_valid_gt += 1
+ elif (valid_class == 0 or (ignore and (valid_class == 1))):
+ ignored_gt.append(1)
+ else:
+ ignored_gt.append(-1)
+ # for i in range(num_gt):
+ if gt_anno['name'][i] == 'DontCare':
+ dc_bboxes.append(gt_anno['bbox'][i])
+ for i in range(num_dt):
+ if (dt_anno['name'][i].lower() == current_cls_name):
+ valid_class = 1
+ else:
+ valid_class = -1
+ height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1])
+ if height < MIN_HEIGHT[difficulty]:
+ ignored_dt.append(1)
+ elif valid_class == 1:
+ ignored_dt.append(0)
+ else:
+ ignored_dt.append(-1)
+
+ return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
+
+
+@numba.jit(nopython=True)
+def image_box_overlap(boxes, query_boxes, criterion=-1):
+ N = boxes.shape[0]
+ K = query_boxes.shape[0]
+ overlaps = np.zeros((N, K), dtype=boxes.dtype)
+ for k in range(K):
+ qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) *
+ (query_boxes[k, 3] - query_boxes[k, 1]))
+ for n in range(N):
+ iw = (
+ min(boxes[n, 2], query_boxes[k, 2]) -
+ max(boxes[n, 0], query_boxes[k, 0]))
+ if iw > 0:
+ ih = (
+ min(boxes[n, 3], query_boxes[k, 3]) -
+ max(boxes[n, 1], query_boxes[k, 1]))
+ if ih > 0:
+ if criterion == -1:
+ ua = ((boxes[n, 2] - boxes[n, 0]) *
+ (boxes[n, 3] - boxes[n, 1]) + qbox_area -
+ iw * ih)
+ elif criterion == 0:
+ ua = ((boxes[n, 2] - boxes[n, 0]) *
+ (boxes[n, 3] - boxes[n, 1]))
+ elif criterion == 1:
+ ua = qbox_area
+ else:
+ ua = 1.0
+ overlaps[n, k] = iw * ih / ua
+ return overlaps
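+
+# Worked example: boxes = [[0, 0, 2, 2]] and query_boxes = [[1, 1, 3, 3]]
+# intersect in a 1x1 region, so with the default criterion the overlap entry
+# is 1 / (4 + 4 - 1) = 1 / 7.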
+
+
+def bev_box_overlap(boxes, qboxes, criterion=-1):
+ from .rotate_iou import rotate_iou_gpu_eval
+ riou = rotate_iou_gpu_eval(boxes, qboxes, criterion)
+ return riou
+
+
+@numba.jit(nopython=True, parallel=True)
+def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1):
+ # ONLY support overlap in CAMERA, not lidar.
+ # TODO: change to use prange for parallel mode, should check the difference
+ N, K = boxes.shape[0], qboxes.shape[0]
+ for i in numba.prange(N):
+ for j in numba.prange(K):
+ if rinc[i, j] > 0:
+ # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] +
+ # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1]))
+ iw = (
+ min(boxes[i, 1], qboxes[j, 1]) -
+ max(boxes[i, 1] - boxes[i, 4],
+ qboxes[j, 1] - qboxes[j, 4]))
+
+ if iw > 0:
+ area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5]
+ area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5]
+ inc = iw * rinc[i, j]
+ if criterion == -1:
+ ua = (area1 + area2 - inc)
+ elif criterion == 0:
+ ua = area1
+ elif criterion == 1:
+ ua = area2
+ else:
+ ua = inc
+ rinc[i, j] = inc / ua
+ else:
+ rinc[i, j] = 0.0
+
+
+def d3_box_overlap(boxes, qboxes, criterion=-1):
+ from .rotate_iou import rotate_iou_gpu_eval
+ rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]],
+ qboxes[:, [0, 2, 3, 5, 6]], 2)
+ d3_box_overlap_kernel(boxes, qboxes, rinc, criterion)
+ return rinc
+
+
+@numba.jit(nopython=True)
+def compute_statistics_jit(overlaps,
+ gt_datas,
+ dt_datas,
+ ignored_gt,
+ ignored_det,
+ dc_bboxes,
+ metric,
+ min_overlap,
+ thresh=0,
+ compute_fp=False,
+ compute_aos=False):
+
+ det_size = dt_datas.shape[0]
+ gt_size = gt_datas.shape[0]
+ dt_scores = dt_datas[:, -1]
+ dt_alphas = dt_datas[:, 4]
+ gt_alphas = gt_datas[:, 4]
+ dt_bboxes = dt_datas[:, :4]
+ # gt_bboxes = gt_datas[:, :4]
+
+ assigned_detection = [False] * det_size
+ ignored_threshold = [False] * det_size
+ if compute_fp:
+ for i in range(det_size):
+ if (dt_scores[i] < thresh):
+ ignored_threshold[i] = True
+ NO_DETECTION = -10000000
+ tp, fp, fn, similarity = 0, 0, 0, 0
+ # thresholds = [0.0]
+ # delta = [0.0]
+ thresholds = np.zeros((gt_size, ))
+ thresh_idx = 0
+ delta = np.zeros((gt_size, ))
+ delta_idx = 0
+ for i in range(gt_size):
+ if ignored_gt[i] == -1:
+ continue
+ det_idx = -1
+ valid_detection = NO_DETECTION
+ max_overlap = 0
+ assigned_ignored_det = False
+
+ for j in range(det_size):
+ if (ignored_det[j] == -1):
+ continue
+ if (assigned_detection[j]):
+ continue
+ if (ignored_threshold[j]):
+ continue
+ overlap = overlaps[j, i]
+ dt_score = dt_scores[j]
+ if (not compute_fp and (overlap > min_overlap)
+ and dt_score > valid_detection):
+ det_idx = j
+ valid_detection = dt_score
+ elif (compute_fp and (overlap > min_overlap)
+ and (overlap > max_overlap or assigned_ignored_det)
+ and ignored_det[j] == 0):
+ max_overlap = overlap
+ det_idx = j
+ valid_detection = 1
+ assigned_ignored_det = False
+ elif (compute_fp and (overlap > min_overlap)
+ and (valid_detection == NO_DETECTION)
+ and ignored_det[j] == 1):
+ det_idx = j
+ valid_detection = 1
+ assigned_ignored_det = True
+
+ if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0:
+ fn += 1
+ elif ((valid_detection != NO_DETECTION)
+ and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)):
+ assigned_detection[det_idx] = True
+ elif valid_detection != NO_DETECTION:
+ tp += 1
+ # thresholds.append(dt_scores[det_idx])
+ thresholds[thresh_idx] = dt_scores[det_idx]
+ thresh_idx += 1
+ if compute_aos:
+ # delta.append(gt_alphas[i] - dt_alphas[det_idx])
+ delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx]
+ delta_idx += 1
+
+ assigned_detection[det_idx] = True
+ if compute_fp:
+ for i in range(det_size):
+ if (not (assigned_detection[i] or ignored_det[i] == -1
+ or ignored_det[i] == 1 or ignored_threshold[i])):
+ fp += 1
+ nstuff = 0
+ if metric == 0:
+ overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0)
+ for i in range(dc_bboxes.shape[0]):
+ for j in range(det_size):
+ if (assigned_detection[j]):
+ continue
+ if (ignored_det[j] == -1 or ignored_det[j] == 1):
+ continue
+ if (ignored_threshold[j]):
+ continue
+ if overlaps_dt_dc[j, i] > min_overlap:
+ assigned_detection[j] = True
+ nstuff += 1
+ fp -= nstuff
+ if compute_aos:
+ tmp = np.zeros((fp + delta_idx, ))
+ # tmp = [0] * fp
+ for i in range(delta_idx):
+ tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0
+ # tmp.append((1.0 + np.cos(delta[i])) / 2.0)
+ # assert len(tmp) == fp + tp
+ # assert len(delta) == tp
+ if tp > 0 or fp > 0:
+ similarity = np.sum(tmp)
+ else:
+ similarity = -1
+ return tp, fp, fn, similarity, thresholds[:thresh_idx]
+
+
+def get_split_parts(num, num_part):
+ same_part = num // num_part
+ remain_num = num % num_part
+ if remain_num == 0:
+ return [same_part] * num_part
+ else:
+ return [same_part] * num_part + [remain_num]
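+
+# Worked example: get_split_parts(10, 3) -> [3, 3, 3, 1] and
+# get_split_parts(9, 3) -> [3, 3, 3].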
+
+
+@numba.jit(nopython=True)
+def fused_compute_statistics(overlaps,
+ pr,
+ gt_nums,
+ dt_nums,
+ dc_nums,
+ gt_datas,
+ dt_datas,
+ dontcares,
+ ignored_gts,
+ ignored_dets,
+ metric,
+ min_overlap,
+ thresholds,
+ compute_aos=False):
+ gt_num = 0
+ dt_num = 0
+ dc_num = 0
+ for i in range(gt_nums.shape[0]):
+ for t, thresh in enumerate(thresholds):
+ overlap = overlaps[dt_num:dt_num + dt_nums[i],
+ gt_num:gt_num + gt_nums[i]]
+
+ gt_data = gt_datas[gt_num:gt_num + gt_nums[i]]
+ dt_data = dt_datas[dt_num:dt_num + dt_nums[i]]
+ ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]]
+ ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]]
+ dontcare = dontcares[dc_num:dc_num + dc_nums[i]]
+ tp, fp, fn, similarity, _ = compute_statistics_jit(
+ overlap,
+ gt_data,
+ dt_data,
+ ignored_gt,
+ ignored_det,
+ dontcare,
+ metric,
+ min_overlap=min_overlap,
+ thresh=thresh,
+ compute_fp=True,
+ compute_aos=compute_aos)
+ pr[t, 0] += tp
+ pr[t, 1] += fp
+ pr[t, 2] += fn
+ if similarity != -1:
+ pr[t, 3] += similarity
+ gt_num += gt_nums[i]
+ dt_num += dt_nums[i]
+ dc_num += dc_nums[i]
+
+
+def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50):
+    """Fast IoU algorithm. This function can be used independently to do
+    result analysis. Must be used in CAMERA coordinate system.
+
+ Args:
+        gt_annos (dict): Must come from get_label_annos() in kitti_common.py.
+        dt_annos (dict): Must come from get_label_annos() in kitti_common.py.
+ metric (int): Eval type. 0: bbox, 1: bev, 2: 3d.
+        num_parts (int): A parameter for the fast calculation algorithm.
+ """
+ assert len(gt_annos) == len(dt_annos)
+ total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0)
+ total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0)
+ num_examples = len(gt_annos)
+ split_parts = get_split_parts(num_examples, num_parts)
+ parted_overlaps = []
+ example_idx = 0
+
+ for num_part in split_parts:
+ gt_annos_part = gt_annos[example_idx:example_idx + num_part]
+ dt_annos_part = dt_annos[example_idx:example_idx + num_part]
+ if metric == 0:
+ gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0)
+ dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0)
+ overlap_part = image_box_overlap(gt_boxes, dt_boxes)
+ elif metric == 1:
+ loc = np.concatenate(
+ [a['location'][:, [0, 2]] for a in gt_annos_part], 0)
+ dims = np.concatenate(
+ [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0)
+ rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
+ gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+ axis=1)
+ loc = np.concatenate(
+ [a['location'][:, [0, 2]] for a in dt_annos_part], 0)
+ dims = np.concatenate(
+ [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0)
+ rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
+ dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+ axis=1)
+ overlap_part = bev_box_overlap(gt_boxes,
+ dt_boxes).astype(np.float64)
+ elif metric == 2:
+ loc = np.concatenate([a['location'] for a in gt_annos_part], 0)
+ dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0)
+ rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
+ gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+ axis=1)
+ loc = np.concatenate([a['location'] for a in dt_annos_part], 0)
+ dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0)
+ rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
+ dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
+ axis=1)
+ overlap_part = d3_box_overlap(gt_boxes,
+ dt_boxes).astype(np.float64)
+ else:
+ raise ValueError('unknown metric')
+ parted_overlaps.append(overlap_part)
+ example_idx += num_part
+ overlaps = []
+ example_idx = 0
+ for j, num_part in enumerate(split_parts):
+ gt_annos_part = gt_annos[example_idx:example_idx + num_part]
+ dt_annos_part = dt_annos[example_idx:example_idx + num_part]
+ gt_num_idx, dt_num_idx = 0, 0
+ for i in range(num_part):
+ gt_box_num = total_gt_num[example_idx + i]
+ dt_box_num = total_dt_num[example_idx + i]
+ overlaps.append(
+ parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num,
+ dt_num_idx:dt_num_idx + dt_box_num])
+ gt_num_idx += gt_box_num
+ dt_num_idx += dt_box_num
+ example_idx += num_part
+
+ return overlaps, parted_overlaps, total_gt_num, total_dt_num
+
+
+def _prepare_data(gt_annos, dt_annos, current_class, difficulty):
+ gt_datas_list = []
+ dt_datas_list = []
+ total_dc_num = []
+ ignored_gts, ignored_dets, dontcares = [], [], []
+ total_num_valid_gt = 0
+ for i in range(len(gt_annos)):
+ rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty)
+ num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets
+ ignored_gts.append(np.array(ignored_gt, dtype=np.int64))
+ ignored_dets.append(np.array(ignored_det, dtype=np.int64))
+ if len(dc_bboxes) == 0:
+ dc_bboxes = np.zeros((0, 4)).astype(np.float64)
+ else:
+ dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64)
+ total_dc_num.append(dc_bboxes.shape[0])
+ dontcares.append(dc_bboxes)
+ total_num_valid_gt += num_valid_gt
+ gt_datas = np.concatenate(
+ [gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1)
+ dt_datas = np.concatenate([
+ dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis],
+ dt_annos[i]['score'][..., np.newaxis]
+ ], 1)
+ gt_datas_list.append(gt_datas)
+ dt_datas_list.append(dt_datas)
+ total_dc_num = np.stack(total_dc_num, axis=0)
+ return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares,
+ total_dc_num, total_num_valid_gt)
+
+
+def eval_class(gt_annos,
+ dt_annos,
+ current_classes,
+ difficultys,
+ metric,
+ min_overlaps,
+ compute_aos=False,
+ num_parts=200):
+    """KITTI evaluation. Supports 2D/BEV/3D/AOS evaluation and 0.5:0.05:0.95
+    COCO-style AP.
+
+ Args:
+        gt_annos (dict): Must come from get_label_annos() in kitti_common.py.
+        dt_annos (dict): Must come from get_label_annos() in kitti_common.py.
+ current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist.
+        difficultys (list[int]): Eval difficulty, 0: easy, 1: moderate, 2: hard.
+ metric (int): Eval type. 0: bbox, 1: bev, 2: 3d
+        min_overlaps (np.ndarray): Min overlap thresholds with the format
+            [num_overlap, metric, class].
+ num_parts (int): A parameter for fast calculate algorithm
+
+ Returns:
+ dict[str, np.ndarray]: recall, precision and aos
+ """
+ assert len(gt_annos) == len(dt_annos)
+ num_examples = len(gt_annos)
+ if num_examples < num_parts:
+ num_parts = num_examples
+ split_parts = get_split_parts(num_examples, num_parts)
+
+ rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts)
+ overlaps, parted_overlaps, total_dt_num, total_gt_num = rets
+ N_SAMPLE_PTS = 41
+ num_minoverlap = len(min_overlaps)
+ num_class = len(current_classes)
+ num_difficulty = len(difficultys)
+ precision = np.zeros(
+ [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
+ recall = np.zeros(
+ [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
+ aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
+ for m, current_class in enumerate(current_classes):
+ for idx_l, difficulty in enumerate(difficultys):
+ rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty)
+ (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets,
+ dontcares, total_dc_num, total_num_valid_gt) = rets
+ for k, min_overlap in enumerate(min_overlaps[:, metric, m]):
+ thresholdss = []
+ for i in range(len(gt_annos)):
+ rets = compute_statistics_jit(
+ overlaps[i],
+ gt_datas_list[i],
+ dt_datas_list[i],
+ ignored_gts[i],
+ ignored_dets[i],
+ dontcares[i],
+ metric,
+ min_overlap=min_overlap,
+ thresh=0.0,
+ compute_fp=False)
+ tp, fp, fn, similarity, thresholds = rets
+ thresholdss += thresholds.tolist()
+ thresholdss = np.array(thresholdss)
+ thresholds = get_thresholds(thresholdss, total_num_valid_gt)
+ thresholds = np.array(thresholds)
+ pr = np.zeros([len(thresholds), 4])
+ idx = 0
+ for j, num_part in enumerate(split_parts):
+ gt_datas_part = np.concatenate(
+ gt_datas_list[idx:idx + num_part], 0)
+ dt_datas_part = np.concatenate(
+ dt_datas_list[idx:idx + num_part], 0)
+ dc_datas_part = np.concatenate(
+ dontcares[idx:idx + num_part], 0)
+ ignored_dets_part = np.concatenate(
+ ignored_dets[idx:idx + num_part], 0)
+ ignored_gts_part = np.concatenate(
+ ignored_gts[idx:idx + num_part], 0)
+ fused_compute_statistics(
+ parted_overlaps[j],
+ pr,
+ total_gt_num[idx:idx + num_part],
+ total_dt_num[idx:idx + num_part],
+ total_dc_num[idx:idx + num_part],
+ gt_datas_part,
+ dt_datas_part,
+ dc_datas_part,
+ ignored_gts_part,
+ ignored_dets_part,
+ metric,
+ min_overlap=min_overlap,
+ thresholds=thresholds,
+ compute_aos=compute_aos)
+ idx += num_part
+ for i in range(len(thresholds)):
+ recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2])
+ precision[m, idx_l, k, i] = pr[i, 0] / (
+ pr[i, 0] + pr[i, 1])
+ if compute_aos:
+ aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1])
+ for i in range(len(thresholds)):
+ precision[m, idx_l, k, i] = np.max(
+ precision[m, idx_l, k, i:], axis=-1)
+ recall[m, idx_l, k, i] = np.max(
+ recall[m, idx_l, k, i:], axis=-1)
+ if compute_aos:
+ aos[m, idx_l, k, i] = np.max(
+ aos[m, idx_l, k, i:], axis=-1)
+ ret_dict = {
+ 'recall': recall,
+ 'precision': precision,
+ 'orientation': aos,
+ }
+
+ # clean temp variables
+ del overlaps
+ del parted_overlaps
+
+ gc.collect()
+ return ret_dict
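+
+# Note on the returned arrays: 'recall', 'precision' and 'orientation' all have
+# shape [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS (=41)];
+# get_mAP() below averages every 4th of the 41 sample points (11-point
+# interpolation) and scales the result to a percentage.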
+
+
+def get_mAP(prec):
+ sums = 0
+ for i in range(0, prec.shape[-1], 4):
+ sums = sums + prec[..., i]
+ return sums / 11 * 100
+
+
+def print_str(value, *arg, sstream=None):
+ if sstream is None:
+ sstream = sysio.StringIO()
+ sstream.truncate(0)
+ sstream.seek(0)
+ print(value, *arg, file=sstream)
+ return sstream.getvalue()
+
+
+def do_eval(gt_annos,
+ dt_annos,
+ current_classes,
+ min_overlaps,
+ eval_types=['bbox', 'bev', '3d']):
+ # min_overlaps: [num_minoverlap, metric, num_class]
+ difficultys = [0, 1, 2]
+ mAP_bbox = None
+ mAP_aos = None
+ if 'bbox' in eval_types:
+ ret = eval_class(
+ gt_annos,
+ dt_annos,
+ current_classes,
+ difficultys,
+ 0,
+ min_overlaps,
+ compute_aos=('aos' in eval_types))
+ # ret: [num_class, num_diff, num_minoverlap, num_sample_points]
+ mAP_bbox = get_mAP(ret['precision'])
+ if 'aos' in eval_types:
+ mAP_aos = get_mAP(ret['orientation'])
+
+ mAP_bev = None
+ if 'bev' in eval_types:
+ ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1,
+ min_overlaps)
+ mAP_bev = get_mAP(ret['precision'])
+
+ mAP_3d = None
+ if '3d' in eval_types:
+ ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2,
+ min_overlaps)
+ mAP_3d = get_mAP(ret['precision'])
+ return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
+
+
+def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges,
+ compute_aos):
+ # overlap_ranges: [range, metric, num_class]
+ min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]])
+ for i in range(overlap_ranges.shape[1]):
+ for j in range(overlap_ranges.shape[2]):
+ min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j])
+ mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos,
+ current_classes, min_overlaps,
+ compute_aos)
+ # ret: [num_class, num_diff, num_minoverlap]
+ mAP_bbox = mAP_bbox.mean(-1)
+ mAP_bev = mAP_bev.mean(-1)
+ mAP_3d = mAP_3d.mean(-1)
+ if mAP_aos is not None:
+ mAP_aos = mAP_aos.mean(-1)
+ return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
+
+
+def kitti_eval(gt_annos,
+ dt_annos,
+ current_classes,
+ eval_types=['bbox', 'bev', '3d']):
+ """KITTI evaluation.
+
+ Args:
+ gt_annos (list[dict]): Contain gt information of each sample.
+ dt_annos (list[dict]): Contain detected information of each sample.
+        current_classes (list[str]): Classes to evaluate.
+ eval_types (list[str], optional): Types to eval.
+ Defaults to ['bbox', 'bev', '3d'].
+
+ Returns:
+ tuple: String and dict of evaluation results.
+ """
+ assert len(eval_types) > 0, 'must contain at least one evaluation type'
+ if 'aos' in eval_types:
+ assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos'
+ overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7,
+ 0.5], [0.7, 0.5, 0.5, 0.7, 0.5],
+ [0.7, 0.5, 0.5, 0.7, 0.5]])
+ overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
+ [0.5, 0.25, 0.25, 0.5, 0.25],
+ [0.5, 0.25, 0.25, 0.5, 0.25]])
+ min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5]
+ class_to_name = {
+ 0: 'Car',
+ 1: 'Pedestrian',
+ 2: 'Cyclist',
+ 3: 'Van',
+ 4: 'Person_sitting',
+ }
+ name_to_class = {v: n for n, v in class_to_name.items()}
+ if not isinstance(current_classes, (list, tuple)):
+ current_classes = [current_classes]
+ current_classes_int = []
+ for curcls in current_classes:
+ if isinstance(curcls, str):
+ current_classes_int.append(name_to_class[curcls])
+ else:
+ current_classes_int.append(curcls)
+ current_classes = current_classes_int
+ min_overlaps = min_overlaps[:, :, current_classes]
+ result = ''
+ # check whether alpha is valid
+ compute_aos = False
+ pred_alpha = False
+ valid_alpha_gt = False
+ for anno in dt_annos:
+ mask = (anno['alpha'] != -10)
+ if anno['alpha'][mask].shape[0] != 0:
+ pred_alpha = True
+ break
+ for anno in gt_annos:
+ if anno['alpha'][0] != -10:
+ valid_alpha_gt = True
+ break
+ compute_aos = (pred_alpha and valid_alpha_gt)
+ if compute_aos:
+ eval_types.append('aos')
+
+ mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos,
+ current_classes, min_overlaps,
+ eval_types)
+
+ ret_dict = {}
+ difficulty = ['easy', 'moderate', 'hard']
+ for j, curcls in enumerate(current_classes):
+ # mAP threshold array: [num_minoverlap, metric, class]
+ # mAP result: [num_class, num_diff, num_minoverlap]
+ curcls_name = class_to_name[curcls]
+ for i in range(min_overlaps.shape[0]):
+ # prepare results for print
+ result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format(
+ curcls_name, *min_overlaps[i, :, j]))
+ if mAPbbox is not None:
+ result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
+ *mAPbbox[j, :, i])
+ if mAPbev is not None:
+ result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
+ *mAPbev[j, :, i])
+ if mAP3d is not None:
+ result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
+ *mAP3d[j, :, i])
+
+ if compute_aos:
+ result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(
+ *mAPaos[j, :, i])
+
+ # prepare results for logger
+ for idx in range(3):
+ if i == 0:
+ postfix = f'{difficulty[idx]}_strict'
+ else:
+ postfix = f'{difficulty[idx]}_loose'
+ prefix = f'KITTI/{curcls_name}'
+ if mAP3d is not None:
+ ret_dict[f'{prefix}_3D_{postfix}'] = mAP3d[j, idx, i]
+ if mAPbev is not None:
+ ret_dict[f'{prefix}_BEV_{postfix}'] = mAPbev[j, idx, i]
+ if mAPbbox is not None:
+ ret_dict[f'{prefix}_2D_{postfix}'] = mAPbbox[j, idx, i]
+
+ # calculate mAP over all classes if there are multiple classes
+ if len(current_classes) > 1:
+ # prepare results for print
+ result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty))
+ if mAPbbox is not None:
+ mAPbbox = mAPbbox.mean(axis=0)
+ result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, 0])
+ if mAPbev is not None:
+ mAPbev = mAPbev.mean(axis=0)
+ result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, 0])
+ if mAP3d is not None:
+ mAP3d = mAP3d.mean(axis=0)
+ result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0])
+ if compute_aos:
+ mAPaos = mAPaos.mean(axis=0)
+ result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, 0])
+
+ # prepare results for logger
+ for idx in range(3):
+ postfix = f'{difficulty[idx]}'
+ if mAP3d is not None:
+ ret_dict[f'KITTI/Overall_3D_{postfix}'] = mAP3d[idx, 0]
+ if mAPbev is not None:
+ ret_dict[f'KITTI/Overall_BEV_{postfix}'] = mAPbev[idx, 0]
+ if mAPbbox is not None:
+ ret_dict[f'KITTI/Overall_2D_{postfix}'] = mAPbbox[idx, 0]
+
+ return result, ret_dict
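+
+# Illustrative call sketch (annotation dicts are assumed to follow the
+# get_label_annos() format referenced in the docstring above):
+#   result_str, result_dict = kitti_eval(gt_annos, dt_annos,
+#                                        ['Car', 'Pedestrian'])
+#   # result_dict contains keys such as 'KITTI/Car_3D_moderate_strict'.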
+
+
+def kitti_eval_coco_style(gt_annos, dt_annos, current_classes):
+    """COCO-style evaluation of KITTI.
+
+ Args:
+ gt_annos (list[dict]): Contain gt information of each sample.
+ dt_annos (list[dict]): Contain detected information of each sample.
+        current_classes (list[str]): Classes to evaluate.
+
+ Returns:
+        str: Evaluation results.
+ """
+ class_to_name = {
+ 0: 'Car',
+ 1: 'Pedestrian',
+ 2: 'Cyclist',
+ 3: 'Van',
+ 4: 'Person_sitting',
+ }
+ class_to_range = {
+ 0: [0.5, 0.95, 10],
+ 1: [0.25, 0.7, 10],
+ 2: [0.25, 0.7, 10],
+ 3: [0.5, 0.95, 10],
+ 4: [0.25, 0.7, 10],
+ }
+ name_to_class = {v: n for n, v in class_to_name.items()}
+ if not isinstance(current_classes, (list, tuple)):
+ current_classes = [current_classes]
+ current_classes_int = []
+ for curcls in current_classes:
+ if isinstance(curcls, str):
+ current_classes_int.append(name_to_class[curcls])
+ else:
+ current_classes_int.append(curcls)
+ current_classes = current_classes_int
+ overlap_ranges = np.zeros([3, 3, len(current_classes)])
+ for i, curcls in enumerate(current_classes):
+ overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:,
+ np.newaxis]
+ result = ''
+ # check whether alpha is valid
+ compute_aos = False
+ for anno in dt_annos:
+ if anno['alpha'].shape[0] != 0:
+ if anno['alpha'][0] != -10:
+ compute_aos = True
+ break
+ mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval(
+ gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos)
+ for j, curcls in enumerate(current_classes):
+ # mAP threshold array: [num_minoverlap, metric, class]
+ # mAP result: [num_class, num_diff, num_minoverlap]
+ o_range = np.array(class_to_range[curcls])[[0, 2, 1]]
+ o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1)
+ result += print_str((f'{class_to_name[curcls]} '
+ 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range)))
+ result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, '
+ f'{mAPbbox[j, 1]:.2f}, '
+ f'{mAPbbox[j, 2]:.2f}'))
+ result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, '
+ f'{mAPbev[j, 1]:.2f}, '
+ f'{mAPbev[j, 2]:.2f}'))
+ result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, '
+ f'{mAP3d[j, 1]:.2f}, '
+ f'{mAP3d[j, 2]:.2f}'))
+ if compute_aos:
+ result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, '
+ f'{mAPaos[j, 1]:.2f}, '
+ f'{mAPaos[j, 2]:.2f}'))
+ return result
diff --git a/mmcv/core/evaluation/kitti_utils/rotate_iou.py b/mmcv/core/evaluation/kitti_utils/rotate_iou.py
new file mode 100644
index 0000000..2f0c9c8
--- /dev/null
+++ b/mmcv/core/evaluation/kitti_utils/rotate_iou.py
@@ -0,0 +1,379 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+#####################
+# Based on https://github.com/hongzhenwang/RRPN-revise
+# Licensed under The MIT License
+# Author: yanyan, scrin@foxmail.com
+#####################
+import math
+import numba
+import numpy as np
+from numba import cuda
+
+
+@numba.jit(nopython=True)
+def div_up(m, n):
+ return m // n + (m % n > 0)
+
+
+@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
+def trangle_area(a, b, c):
+ return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) *
+ (b[0] - c[0])) / 2.0
+
+
+@cuda.jit('(float32[:], int32)', device=True, inline=True)
+def area(int_pts, num_of_inter):
+ area_val = 0.0
+ for i in range(num_of_inter - 2):
+ area_val += abs(
+ trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4],
+ int_pts[2 * i + 4:2 * i + 6]))
+ return area_val
+
+
+@cuda.jit('(float32[:], int32)', device=True, inline=True)
+def sort_vertex_in_convex_polygon(int_pts, num_of_inter):
+ if num_of_inter > 0:
+ center = cuda.local.array((2, ), dtype=numba.float32)
+ center[:] = 0.0
+ for i in range(num_of_inter):
+ center[0] += int_pts[2 * i]
+ center[1] += int_pts[2 * i + 1]
+ center[0] /= num_of_inter
+ center[1] /= num_of_inter
+ v = cuda.local.array((2, ), dtype=numba.float32)
+ vs = cuda.local.array((16, ), dtype=numba.float32)
+ for i in range(num_of_inter):
+ v[0] = int_pts[2 * i] - center[0]
+ v[1] = int_pts[2 * i + 1] - center[1]
+ d = math.sqrt(v[0] * v[0] + v[1] * v[1])
+ v[0] = v[0] / d
+ v[1] = v[1] / d
+ if v[1] < 0:
+ v[0] = -2 - v[0]
+ vs[i] = v[0]
+ j = 0
+ temp = 0
+ for i in range(1, num_of_inter):
+ if vs[i - 1] > vs[i]:
+ temp = vs[i]
+ tx = int_pts[2 * i]
+ ty = int_pts[2 * i + 1]
+ j = i
+ while j > 0 and vs[j - 1] > temp:
+ vs[j] = vs[j - 1]
+ int_pts[j * 2] = int_pts[j * 2 - 2]
+ int_pts[j * 2 + 1] = int_pts[j * 2 - 1]
+ j -= 1
+
+ vs[j] = temp
+ int_pts[j * 2] = tx
+ int_pts[j * 2 + 1] = ty
+
+
+@cuda.jit(
+ '(float32[:], float32[:], int32, int32, float32[:])',
+ device=True,
+ inline=True)
+def line_segment_intersection(pts1, pts2, i, j, temp_pts):
+ A = cuda.local.array((2, ), dtype=numba.float32)
+ B = cuda.local.array((2, ), dtype=numba.float32)
+ C = cuda.local.array((2, ), dtype=numba.float32)
+ D = cuda.local.array((2, ), dtype=numba.float32)
+
+ A[0] = pts1[2 * i]
+ A[1] = pts1[2 * i + 1]
+
+ B[0] = pts1[2 * ((i + 1) % 4)]
+ B[1] = pts1[2 * ((i + 1) % 4) + 1]
+
+ C[0] = pts2[2 * j]
+ C[1] = pts2[2 * j + 1]
+
+ D[0] = pts2[2 * ((j + 1) % 4)]
+ D[1] = pts2[2 * ((j + 1) % 4) + 1]
+ BA0 = B[0] - A[0]
+ BA1 = B[1] - A[1]
+ DA0 = D[0] - A[0]
+ CA0 = C[0] - A[0]
+ DA1 = D[1] - A[1]
+ CA1 = C[1] - A[1]
+ acd = DA1 * CA0 > CA1 * DA0
+ bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0])
+ if acd != bcd:
+ abc = CA1 * BA0 > BA1 * CA0
+ abd = DA1 * BA0 > BA1 * DA0
+ if abc != abd:
+ DC0 = D[0] - C[0]
+ DC1 = D[1] - C[1]
+ ABBA = A[0] * B[1] - B[0] * A[1]
+ CDDC = C[0] * D[1] - D[0] * C[1]
+ DH = BA1 * DC0 - BA0 * DC1
+ Dx = ABBA * DC0 - BA0 * CDDC
+ Dy = ABBA * DC1 - BA1 * CDDC
+ temp_pts[0] = Dx / DH
+ temp_pts[1] = Dy / DH
+ return True
+ return False
+
+
+@cuda.jit(
+ '(float32[:], float32[:], int32, int32, float32[:])',
+ device=True,
+ inline=True)
+def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts):
+ a = cuda.local.array((2, ), dtype=numba.float32)
+ b = cuda.local.array((2, ), dtype=numba.float32)
+ c = cuda.local.array((2, ), dtype=numba.float32)
+ d = cuda.local.array((2, ), dtype=numba.float32)
+
+ a[0] = pts1[2 * i]
+ a[1] = pts1[2 * i + 1]
+
+ b[0] = pts1[2 * ((i + 1) % 4)]
+ b[1] = pts1[2 * ((i + 1) % 4) + 1]
+
+ c[0] = pts2[2 * j]
+ c[1] = pts2[2 * j + 1]
+
+ d[0] = pts2[2 * ((j + 1) % 4)]
+ d[1] = pts2[2 * ((j + 1) % 4) + 1]
+
+ area_abc = trangle_area(a, b, c)
+ area_abd = trangle_area(a, b, d)
+
+ if area_abc * area_abd >= 0:
+ return False
+
+ area_cda = trangle_area(c, d, a)
+ area_cdb = area_cda + area_abc - area_abd
+
+ if area_cda * area_cdb >= 0:
+ return False
+ t = area_cda / (area_abd - area_abc)
+
+ dx = t * (b[0] - a[0])
+ dy = t * (b[1] - a[1])
+ temp_pts[0] = a[0] + dx
+ temp_pts[1] = a[1] + dy
+ return True
+
+
+@cuda.jit('(float32, float32, float32[:])', device=True, inline=True)
+def point_in_quadrilateral(pt_x, pt_y, corners):
+ ab0 = corners[2] - corners[0]
+ ab1 = corners[3] - corners[1]
+
+ ad0 = corners[6] - corners[0]
+ ad1 = corners[7] - corners[1]
+
+ ap0 = pt_x - corners[0]
+ ap1 = pt_y - corners[1]
+
+ abab = ab0 * ab0 + ab1 * ab1
+ abap = ab0 * ap0 + ab1 * ap1
+ adad = ad0 * ad0 + ad1 * ad1
+ adap = ad0 * ap0 + ad1 * ap1
+
+ return abab >= abap and abap >= 0 and adad >= adap and adap >= 0
+
+
+@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
+def quadrilateral_intersection(pts1, pts2, int_pts):
+ num_of_inter = 0
+ for i in range(4):
+ if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2):
+ int_pts[num_of_inter * 2] = pts1[2 * i]
+ int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1]
+ num_of_inter += 1
+ if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1):
+ int_pts[num_of_inter * 2] = pts2[2 * i]
+ int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1]
+ num_of_inter += 1
+ temp_pts = cuda.local.array((2, ), dtype=numba.float32)
+ for i in range(4):
+ for j in range(4):
+ has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts)
+ if has_pts:
+ int_pts[num_of_inter * 2] = temp_pts[0]
+ int_pts[num_of_inter * 2 + 1] = temp_pts[1]
+ num_of_inter += 1
+
+ return num_of_inter
+
+
+@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
+def rbbox_to_corners(corners, rbbox):
+ # generate clockwise corners and rotate it clockwise
+ angle = rbbox[4]
+ a_cos = math.cos(angle)
+ a_sin = math.sin(angle)
+ center_x = rbbox[0]
+ center_y = rbbox[1]
+ x_d = rbbox[2]
+ y_d = rbbox[3]
+ corners_x = cuda.local.array((4, ), dtype=numba.float32)
+ corners_y = cuda.local.array((4, ), dtype=numba.float32)
+ corners_x[0] = -x_d / 2
+ corners_x[1] = -x_d / 2
+ corners_x[2] = x_d / 2
+ corners_x[3] = x_d / 2
+ corners_y[0] = -y_d / 2
+ corners_y[1] = y_d / 2
+ corners_y[2] = y_d / 2
+ corners_y[3] = -y_d / 2
+ for i in range(4):
+ corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x
+ corners[2 * i +
+ 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y
+
+
+@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
+def inter(rbbox1, rbbox2):
+ """Compute intersection of two rotated boxes.
+
+ Args:
+        rbbox1 (np.ndarray, shape=[5]): Rotated 2d box.
+        rbbox2 (np.ndarray, shape=[5]): Rotated 2d box.
+
+ Returns:
+ float: Intersection of two rotated boxes.
+ """
+ corners1 = cuda.local.array((8, ), dtype=numba.float32)
+ corners2 = cuda.local.array((8, ), dtype=numba.float32)
+ intersection_corners = cuda.local.array((16, ), dtype=numba.float32)
+
+ rbbox_to_corners(corners1, rbbox1)
+ rbbox_to_corners(corners2, rbbox2)
+
+ num_intersection = quadrilateral_intersection(corners1, corners2,
+ intersection_corners)
+ sort_vertex_in_convex_polygon(intersection_corners, num_intersection)
+ # print(intersection_corners.reshape([-1, 2])[:num_intersection])
+
+ return area(intersection_corners, num_intersection)
+
+
+@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True)
+def devRotateIoUEval(rbox1, rbox2, criterion=-1):
+ """Compute rotated iou on device.
+
+ Args:
+ rbox1 (np.ndarray, shape=[5]): Rotated 2d box.
+ rbox2 (np.ndarray, shape=[5]): Rotated 2d box.
+ criterion (int, optional): Indicate different type of iou.
+ -1 indicate `area_inter / (area1 + area2 - area_inter)`,
+ 0 indicate `area_inter / area1`,
+ 1 indicate `area_inter / area2`.
+
+ Returns:
+ float: iou between two input boxes.
+ """
+ area1 = rbox1[2] * rbox1[3]
+ area2 = rbox2[2] * rbox2[3]
+ area_inter = inter(rbox1, rbox2)
+ if criterion == -1:
+ return area_inter / (area1 + area2 - area_inter)
+ elif criterion == 0:
+ return area_inter / area1
+ elif criterion == 1:
+ return area_inter / area2
+ else:
+ return area_inter
+
+
+@cuda.jit(
+ '(int64, int64, float32[:], float32[:], float32[:], int32)',
+ fastmath=False)
+def rotate_iou_kernel_eval(N,
+ K,
+ dev_boxes,
+ dev_query_boxes,
+ dev_iou,
+ criterion=-1):
+ """Kernel of computing rotated iou.
+
+ Args:
+ N (int): The number of boxes.
+ K (int): The number of query boxes.
+ dev_boxes (np.ndarray): Boxes on device.
+ dev_query_boxes (np.ndarray): Query boxes on device.
+ dev_iou (np.ndarray): Computed iou to return.
+ criterion (int, optional): Indicate different type of iou.
+ -1 indicate `area_inter / (area1 + area2 - area_inter)`,
+ 0 indicate `area_inter / area1`,
+ 1 indicate `area_inter / area2`.
+ """
+ threadsPerBlock = 8 * 8
+ row_start = cuda.blockIdx.x
+ col_start = cuda.blockIdx.y
+ tx = cuda.threadIdx.x
+ row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
+ col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
+ block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
+ block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
+
+ dev_query_box_idx = threadsPerBlock * col_start + tx
+ dev_box_idx = threadsPerBlock * row_start + tx
+ if (tx < col_size):
+ block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
+ block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
+ block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
+ block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
+ block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
+ if (tx < row_size):
+ block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
+ block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
+ block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
+ block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
+ block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
+ cuda.syncthreads()
+ if tx < row_size:
+ for i in range(col_size):
+ offset = (
+ row_start * threadsPerBlock * K + col_start * threadsPerBlock +
+ tx * K + i)
+ dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
+ block_boxes[tx * 5:tx * 5 + 5],
+ criterion)
+
+
+def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
+    """Rotated box IoU running on GPU. 500x faster than the CPU version (takes
+    5ms in one example with numba.cuda code). Converted from [this project](
+    https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
+
+ Args:
+        boxes (np.ndarray): rbboxes. format: centers, dims,
+            angles(clockwise when positive) with the shape of [N, 5].
+        query_boxes (np.ndarray): rbboxes to compute iou with boxes,
+            with the shape of [K, 5].
+ device_id (int, optional): Defaults to 0. Device to use.
+ criterion (int, optional): Indicate different type of iou.
+ -1 indicate `area_inter / (area1 + area2 - area_inter)`,
+ 0 indicate `area_inter / area1`,
+ 1 indicate `area_inter / area2`.
+
+ Returns:
+ np.ndarray: IoU results.
+ """
+ boxes = boxes.astype(np.float32)
+ query_boxes = query_boxes.astype(np.float32)
+ N = boxes.shape[0]
+ K = query_boxes.shape[0]
+ iou = np.zeros((N, K), dtype=np.float32)
+ if N == 0 or K == 0:
+ return iou
+ threadsPerBlock = 8 * 8
+ cuda.select_device(device_id)
+ blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
+
+ stream = cuda.stream()
+ with stream.auto_synchronize():
+ boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
+ query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
+ iou_dev = cuda.to_device(iou.reshape([-1]), stream)
+ rotate_iou_kernel_eval[blockspergrid, threadsPerBlock,
+ stream](N, K, boxes_dev, query_boxes_dev,
+ iou_dev, criterion)
+ iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
+ return iou.astype(boxes.dtype)
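+
+# Minimal usage sketch (requires a CUDA device; synthetic axis-aligned boxes
+# in (center_x, center_y, w, h, angle) format):
+#   boxes = np.array([[0., 0., 2., 2., 0.]], dtype=np.float32)
+#   qboxes = np.array([[1., 0., 2., 2., 0.]], dtype=np.float32)
+#   rotate_iou_gpu_eval(boxes, qboxes)  # -> approximately [[0.333]]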
diff --git a/mmcv/core/evaluation/lyft_eval.py b/mmcv/core/evaluation/lyft_eval.py
new file mode 100644
index 0000000..bfb95a1
--- /dev/null
+++ b/mmcv/core/evaluation/lyft_eval.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap,
+ get_class_names,
+ get_ious,
+ group_by_key,
+ wrap_in_box)
+from mmcv.utils import print_log, track_iter_progress
+from mmcv.fileio.io import dump, load
+from os import path as osp
+from terminaltables import AsciiTable
+
+
+# def load_lyft_gts(lyft, data_root, eval_split, logger=None):
+# """Loads ground truth boxes from database.
+
+# Args:
+# lyft (:obj:`LyftDataset`): Lyft class in the sdk.
+# data_root (str): Root of data for reading splits.
+# eval_split (str): Name of the split for evaluation.
+# logger (logging.Logger | str | None): Logger used for printing
+# related information during evaluation. Default: None.
+
+# Returns:
+# list[dict]: List of annotation dictionaries.
+# """
+# split_scenes = mmcv.list_from_file(
+# osp.join(data_root, f'{eval_split}.txt'))
+
+# # Read out all sample_tokens in DB.
+# sample_tokens_all = [s['token'] for s in lyft.sample]
+# assert len(sample_tokens_all) > 0, 'Error: Database has no samples!'
+
+# if eval_split == 'test':
+# # Check that you aren't trying to cheat :)
+# assert len(lyft.sample_annotation) > 0, \
+# 'Error: You are trying to evaluate on the test set \
+# but you do not have the annotations!'
+
+# sample_tokens = []
+# for sample_token in sample_tokens_all:
+# scene_token = lyft.get('sample', sample_token)['scene_token']
+# scene_record = lyft.get('scene', scene_token)
+# if scene_record['name'] in split_scenes:
+# sample_tokens.append(sample_token)
+
+# all_annotations = []
+
+# print_log('Loading ground truth annotations...', logger=logger)
+# # Load annotations and filter predictions and annotations.
+# for sample_token in track_iter_progress(sample_tokens):
+# sample = lyft.get('sample', sample_token)
+# sample_annotation_tokens = sample['anns']
+# for sample_annotation_token in sample_annotation_tokens:
+# # Get label name in detection task and filter unused labels.
+# sample_annotation = \
+# lyft.get('sample_annotation', sample_annotation_token)
+# detection_name = sample_annotation['category_name']
+# if detection_name is None:
+# continue
+# annotation = {
+# 'sample_token': sample_token,
+# 'translation': sample_annotation['translation'],
+# 'size': sample_annotation['size'],
+# 'rotation': sample_annotation['rotation'],
+# 'name': detection_name,
+# }
+# all_annotations.append(annotation)
+
+# return all_annotations
+
+
+def load_lyft_predictions(res_path):
+ """Load Lyft predictions from json file.
+
+ Args:
+ res_path (str): Path of result json file recording detections.
+
+ Returns:
+ list[dict]: List of prediction dictionaries.
+ """
+ predictions = load(res_path)
+ predictions = predictions['results']
+ all_preds = []
+ for sample_token in predictions.keys():
+ all_preds.extend(predictions[sample_token])
+ return all_preds
+
+
+def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None):
+ """Evaluation API for Lyft dataset.
+
+ Args:
+ lyft (:obj:`LyftDataset`): Lyft class in the sdk.
+ data_root (str): Root of data for reading splits.
+ res_path (str): Path of result json file recording detections.
+ eval_set (str): Name of the split for evaluation.
+ output_dir (str): Output directory for output json files.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+
+ Returns:
+ dict[str, float]: The evaluation results.
+ """
+ # evaluate by lyft metrics
+ gts = load_lyft_gts(lyft, data_root, eval_set, logger)
+ predictions = load_lyft_predictions(res_path)
+
+ class_names = get_class_names(gts)
+ print('Calculating mAP@0.5:0.95...')
+
+ iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
+ metrics = {}
+ average_precisions = \
+ get_classwise_aps(gts, predictions, class_names, iou_thresholds)
+ APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]]
+
+ mAPs = np.mean(average_precisions, axis=0)
+ mAPs_cate = np.mean(average_precisions, axis=1)
+ final_mAP = np.mean(mAPs)
+
+ metrics['average_precisions'] = average_precisions.tolist()
+ metrics['mAPs'] = mAPs.tolist()
+ metrics['Final mAP'] = float(final_mAP)
+ metrics['class_names'] = class_names
+ metrics['mAPs_cate'] = mAPs_cate.tolist()
+
+ APs_data = [['class', 'mAP@0.5:0.95']]
+ for i in range(len(class_names)):
+ row = [class_names[i], round(mAPs_cate[i], 3)]
+ APs_data.append(row)
+ APs_data.append(['Overall', round(final_mAP, 3)])
+ APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95')
+ APs_table.inner_footing_row_border = True
+ print_log(APs_table.table, logger=logger)
+
+ res_path = osp.join(output_dir, 'lyft_metrics.json')
+ dump(metrics, res_path)
+ return metrics
+
+
+def get_classwise_aps(gt, predictions, class_names, iou_thresholds):
+ """Returns an array with an average precision per class.
+
+ Note: Ground truth and predictions should have the following format.
+
+ .. code-block::
+
+ gt = [{
+            'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207'
+                'fbb039a550991a5149214f98cec136ac',
+ 'translation': [974.2811881299899, 1714.6815014457964,
+ -23.689857123368846],
+ 'size': [1.796, 4.488, 1.664],
+ 'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121],
+ 'name': 'car'
+ }]
+
+ predictions = [{
+            'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207'
+                'fbb039a550991a5149214f98cec136ac',
+ 'translation': [971.8343488872263, 1713.6816097857359,
+ -25.82534357061308],
+ 'size': [2.519726579986132, 7.810161372666739, 3.483438286096803],
+ 'rotation': [0.10913582721095375, 0.04099572636992043,
+ 0.01927712319721745, 1.029328402625659],
+ 'name': 'car',
+ 'score': 0.3077029437237213
+ }]
+
+    Args:
+        gt (list[dict]): list of dictionaries in the format described above.
+        predictions (list[dict]): list of dictionaries in the format
+            described above.
+        class_names (list[str]): list of the class names.
+        iou_thresholds (list[float]): IOU thresholds used to decide whether
+            a prediction counts as a TP or an FP.
+
+ Returns:
+ np.ndarray: an array with an average precision per class.
+ """
+ assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds])
+
+ gt_by_class_name = group_by_key(gt, 'name')
+ pred_by_class_name = group_by_key(predictions, 'name')
+
+ average_precisions = np.zeros((len(class_names), len(iou_thresholds)))
+
+ for class_id, class_name in enumerate(class_names):
+ if class_name in pred_by_class_name:
+ recalls, precisions, average_precision = get_single_class_aps(
+ gt_by_class_name[class_name], pred_by_class_name[class_name],
+ iou_thresholds)
+ average_precisions[class_id, :] = average_precision
+
+ return average_precisions
+
+
+def get_single_class_aps(gt, predictions, iou_thresholds):
+ """Compute recall and precision for all iou thresholds. Adapted from
+ LyftDatasetDevkit.
+
+    Args:
+        gt (list[dict]): list of dictionaries in the format described in
+            :func:`get_classwise_aps`.
+        predictions (list[dict]): list of dictionaries in the format
+            described in :func:`get_classwise_aps`.
+        iou_thresholds (list[float]): IOU thresholds used to decide whether
+            a prediction counts as a TP or an FP.
+
+    Returns:
+        tuple[np.ndarray]: (recalls, precisions, average precisions), each
+            with one entry (or column) per IoU threshold for this class.
+ """
+ num_gts = len(gt)
+ image_gts = group_by_key(gt, 'sample_token')
+ image_gts = wrap_in_box(image_gts)
+
+ sample_gt_checked = {
+ sample_token: np.zeros((len(boxes), len(iou_thresholds)))
+ for sample_token, boxes in image_gts.items()
+ }
+
+ predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
+
+ # go down dets and mark TPs and FPs
+ num_predictions = len(predictions)
+ tps = np.zeros((num_predictions, len(iou_thresholds)))
+ fps = np.zeros((num_predictions, len(iou_thresholds)))
+
+ for prediction_index, prediction in enumerate(predictions):
+ predicted_box = Box3D(**prediction)
+
+ sample_token = prediction['sample_token']
+
+ max_overlap = -np.inf
+ jmax = -1
+
+ if sample_token in image_gts:
+ gt_boxes = image_gts[sample_token]
+ # gt_boxes per sample
+ gt_checked = sample_gt_checked[sample_token]
+ # gt flags per sample
+ else:
+ gt_boxes = []
+ gt_checked = None
+
+ if len(gt_boxes) > 0:
+ overlaps = get_ious(gt_boxes, predicted_box)
+
+ max_overlap = np.max(overlaps)
+
+ jmax = np.argmax(overlaps)
+
+ for i, iou_threshold in enumerate(iou_thresholds):
+ if max_overlap > iou_threshold:
+ if gt_checked[jmax, i] == 0:
+ tps[prediction_index, i] = 1.0
+ gt_checked[jmax, i] = 1
+ else:
+ fps[prediction_index, i] = 1.0
+ else:
+ fps[prediction_index, i] = 1.0
+
+ # compute precision recall
+ fps = np.cumsum(fps, axis=0)
+ tps = np.cumsum(tps, axis=0)
+
+ recalls = tps / float(num_gts)
+ # avoid divide by zero in case the first detection
+ # matches a difficult ground truth
+ precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps)
+
+ aps = []
+ for i in range(len(iou_thresholds)):
+ recall = recalls[:, i]
+ precision = precisions[:, i]
+ assert np.all(0 <= recall) & np.all(recall <= 1)
+ assert np.all(0 <= precision) & np.all(precision <= 1)
+ ap = get_ap(recall, precision)
+ aps.append(ap)
+
+ aps = np.array(aps)
+
+ return recalls, precisions, aps
diff --git a/mmcv/core/evaluation/mean_ap.py b/mmcv/core/evaluation/mean_ap.py
new file mode 100644
index 0000000..dca238b
--- /dev/null
+++ b/mmcv/core/evaluation/mean_ap.py
@@ -0,0 +1,467 @@
+from multiprocessing import Pool
+import numpy as np
+from mmcv.utils import print_log, is_str
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+from .class_names import get_classes
+
+
+def average_precision(recalls, precisions, mode='area'):
+ """Calculate average precision (for single or multiple scales).
+
+ Args:
+ recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+ precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+ mode (str): 'area' or '11points', 'area' means calculating the area
+ under precision-recall curve, '11points' means calculating
+ the average precision of recalls at [0, 0.1, ..., 1]
+
+ Returns:
+ float or ndarray: calculated average precision
+ """
+ no_scale = False
+ if recalls.ndim == 1:
+ no_scale = True
+ recalls = recalls[np.newaxis, :]
+ precisions = precisions[np.newaxis, :]
+ assert recalls.shape == precisions.shape and recalls.ndim == 2
+ num_scales = recalls.shape[0]
+ ap = np.zeros(num_scales, dtype=np.float32)
+ if mode == 'area':
+ zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+ ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+ mrec = np.hstack((zeros, recalls, ones))
+ mpre = np.hstack((zeros, precisions, zeros))
+ for i in range(mpre.shape[1] - 1, 0, -1):
+ mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+ for i in range(num_scales):
+ ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+ ap[i] = np.sum(
+ (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+ elif mode == '11points':
+ for i in range(num_scales):
+ for thr in np.arange(0, 1 + 1e-3, 0.1):
+ precs = precisions[i, recalls[i, :] >= thr]
+ prec = precs.max() if precs.size > 0 else 0
+ ap[i] += prec
+ ap /= 11
+ else:
+ raise ValueError(
+ 'Unrecognized mode, only "area" and "11points" are supported')
+ if no_scale:
+ ap = ap[0]
+ return ap
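+
+# Worked example (illustrative only): with cumulative recall [0.5, 1.0] and
+# precision [1.0, 0.5], 'area' mode integrates the interpolated PR curve to
+# 0.5 * 1.0 + 0.5 * 0.5 = 0.75.
+# >>> average_precision(np.array([0.5, 1.0]), np.array([1.0, 0.5]))  # ~0.75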
+
+
+def tpfp_imagenet(det_bboxes,
+ gt_bboxes,
+ gt_bboxes_ignore=None,
+ default_iou_thr=0.5,
+ area_ranges=None):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+        det_bboxes (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ default_iou_thr (float): IoU threshold to be considered as matched for
+ medium and large bboxes (small ones have special rules).
+ Default: 0.5.
+ area_ranges (list[tuple] | None): Range of bbox areas to be evaluated,
+ in the format [(min1, max1), (min2, max2), ...]. Default: None.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+ # an indicator of ignored gts
+ gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0], dtype=bool),
+         np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+ # stack gt_bboxes and gt_bboxes_ignore for convenience
+ gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+ num_dets = det_bboxes.shape[0]
+ num_gts = gt_bboxes.shape[0]
+ if area_ranges is None:
+ area_ranges = [(None, None)]
+ num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp
+    # of a certain scale.
+ tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+ fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+ if gt_bboxes.shape[0] == 0:
+ if area_ranges == [(None, None)]:
+ fp[...] = 1
+ else:
+ det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * (
+ det_bboxes[:, 3] - det_bboxes[:, 1])
+ for i, (min_area, max_area) in enumerate(area_ranges):
+ fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+ return tp, fp
+ ious = bbox_overlaps(det_bboxes, gt_bboxes - 1)
+ gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+ gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
+ iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
+ default_iou_thr)
+ # sort all detections by scores in descending order
+ sort_inds = np.argsort(-det_bboxes[:, -1])
+ for k, (min_area, max_area) in enumerate(area_ranges):
+ gt_covered = np.zeros(num_gts, dtype=bool)
+ # if no area range is specified, gt_area_ignore is all False
+ if min_area is None:
+ gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+ else:
+ gt_areas = gt_w * gt_h
+ gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+ for i in sort_inds:
+ max_iou = -1
+ matched_gt = -1
+ # find best overlapped available gt
+ for j in range(num_gts):
+ # different from PASCAL VOC: allow finding other gts if the
+ # best overlapped ones are already matched by other det bboxes
+ if gt_covered[j]:
+ continue
+ elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou:
+ max_iou = ious[i, j]
+ matched_gt = j
+ # there are 4 cases for a det bbox:
+ # 1. it matches a gt, tp = 1, fp = 0
+ # 2. it matches an ignored gt, tp = 0, fp = 0
+ # 3. it matches no gt and within area range, tp = 0, fp = 1
+ # 4. it matches no gt but is beyond area range, tp = 0, fp = 0
+ if matched_gt >= 0:
+ gt_covered[matched_gt] = 1
+ if not (gt_ignore_inds[matched_gt]
+ or gt_area_ignore[matched_gt]):
+ tp[k, i] = 1
+ elif min_area is None:
+ fp[k, i] = 1
+ else:
+ bbox = det_bboxes[i, :4]
+ area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+ if area >= min_area and area < max_area:
+ fp[k, i] = 1
+ return tp, fp
+
+
+def tpfp_default(det_bboxes,
+ gt_bboxes,
+ gt_bboxes_ignore=None,
+ iou_thr=0.5,
+ area_ranges=None):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+        det_bboxes (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ area_ranges (list[tuple] | None): Range of bbox areas to be evaluated,
+ in the format [(min1, max1), (min2, max2), ...]. Default: None.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+ # an indicator of ignored gts
+ gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0], dtype=bool),
+         np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+ # stack gt_bboxes and gt_bboxes_ignore for convenience
+ gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+ num_dets = det_bboxes.shape[0]
+ num_gts = gt_bboxes.shape[0]
+ if area_ranges is None:
+ area_ranges = [(None, None)]
+ num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp of
+    # a certain scale
+ tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+ fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+ # if there is no gt bboxes in this image, then all det bboxes
+ # within area range are false positives
+ if gt_bboxes.shape[0] == 0:
+ if area_ranges == [(None, None)]:
+ fp[...] = 1
+ else:
+ det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * (
+ det_bboxes[:, 3] - det_bboxes[:, 1])
+ for i, (min_area, max_area) in enumerate(area_ranges):
+ fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+ return tp, fp
+
+ ious = bbox_overlaps(det_bboxes, gt_bboxes)
+ # for each det, the max iou with all gts
+ ious_max = ious.max(axis=1)
+ # for each det, which gt overlaps most with it
+ ious_argmax = ious.argmax(axis=1)
+ # sort all dets in descending order by scores
+ sort_inds = np.argsort(-det_bboxes[:, -1])
+ for k, (min_area, max_area) in enumerate(area_ranges):
+ gt_covered = np.zeros(num_gts, dtype=bool)
+ # if no area range is specified, gt_area_ignore is all False
+ if min_area is None:
+ gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
+ else:
+ gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+ gt_bboxes[:, 3] - gt_bboxes[:, 1])
+ gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+ for i in sort_inds:
+ if ious_max[i] >= iou_thr:
+ matched_gt = ious_argmax[i]
+ if not (gt_ignore_inds[matched_gt]
+ or gt_area_ignore[matched_gt]):
+ if not gt_covered[matched_gt]:
+ gt_covered[matched_gt] = True
+ tp[k, i] = 1
+ else:
+ fp[k, i] = 1
+ # otherwise ignore this detected bbox, tp = 0, fp = 0
+ elif min_area is None:
+ fp[k, i] = 1
+ else:
+ bbox = det_bboxes[i, :4]
+ area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+ if area >= min_area and area < max_area:
+ fp[k, i] = 1
+ return tp, fp
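+
+# Minimal sketch of the expected behaviour (illustrative, not a test): the
+# detection overlapping the single gt box is a TP, the other one an FP.
+# >>> dets = np.array([[0., 0., 10., 10., 0.9], [20., 20., 30., 30., 0.8]])
+# >>> gts = np.array([[0., 0., 10., 10.]])
+# >>> tp, fp = tpfp_default(dets, gts, np.empty((0, 4)))
+# >>> tp.tolist(), fp.tolist()  # ([[1.0, 0.0]], [[0.0, 1.0]])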
+
+
+def get_cls_results(det_results, annotations, class_id):
+ """Get det results and gt information of a certain class.
+
+ Args:
+ det_results (list[list]): Same as `eval_map()`.
+ annotations (list[dict]): Same as `eval_map()`.
+ class_id (int): ID of a specific class.
+
+ Returns:
+ tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes
+ """
+ cls_dets = [img_res[class_id] for img_res in det_results]
+ cls_gts = []
+ cls_gts_ignore = []
+ for ann in annotations:
+ gt_inds = ann['labels'] == class_id
+ cls_gts.append(ann['bboxes'][gt_inds, :])
+
+ if ann.get('labels_ignore', None) is not None:
+ ignore_inds = ann['labels_ignore'] == class_id
+ cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :])
+ else:
+ cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32))
+
+ return cls_dets, cls_gts, cls_gts_ignore
+
+
+def eval_map(det_results,
+ annotations,
+ scale_ranges=None,
+ iou_thr=0.5,
+ dataset=None,
+ logger=None,
+ tpfp_fn=None,
+ nproc=4):
+ """Evaluate mAP of a dataset.
+
+ Args:
+ det_results (list[list]): [[cls1_det, cls2_det, ...], ...].
+ The outer list indicates images, and the inner list indicates
+ per-class detected bboxes.
+ annotations (list[dict]): Ground truth annotations where each item of
+ the list indicates an image. Keys of annotations are:
+
+ - `bboxes`: numpy array of shape (n, 4)
+ - `labels`: numpy array of shape (n, )
+ - `bboxes_ignore` (optional): numpy array of shape (k, 4)
+ - `labels_ignore` (optional): numpy array of shape (k, )
+ scale_ranges (list[tuple] | None): Range of scales to be evaluated,
+ in the format [(min1, max1), (min2, max2), ...]. A range of
+ (32, 64) means the area range between (32**2, 64**2).
+ Default: None.
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ dataset (list[str] | str | None): Dataset name or dataset classes,
+            there are minor differences in metrics for different datasets, e.g.
+ "voc07", "imagenet_det", etc. Default: None.
+ logger (logging.Logger | str | None): The way to print the mAP
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+ tpfp_fn (callable | None): The function used to determine true/
+ false positives. If None, :func:`tpfp_default` is used as default
+ unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this
+ case). If it is given as a function, then this function is used
+ to evaluate tp & fp. Default None.
+ nproc (int): Processes used for computing TP and FP.
+ Default: 4.
+
+ Returns:
+ tuple: (mAP, [dict, dict, ...])
+ """
+ assert len(det_results) == len(annotations)
+
+ num_imgs = len(det_results)
+ num_scales = len(scale_ranges) if scale_ranges is not None else 1
+ num_classes = len(det_results[0]) # positive class num
+ area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
+ if scale_ranges is not None else None)
+
+ pool = Pool(nproc)
+ eval_results = []
+ for i in range(num_classes):
+ # get gt and det bboxes of this class
+ cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
+ det_results, annotations, i)
+ # choose proper function according to datasets to compute tp and fp
+ if tpfp_fn is None:
+ if dataset in ['det', 'vid']:
+ tpfp_fn = tpfp_imagenet
+ else:
+ tpfp_fn = tpfp_default
+ if not callable(tpfp_fn):
+ raise ValueError(
+ f'tpfp_fn has to be a function or None, but got {tpfp_fn}')
+
+ # compute tp and fp for each image with multiple processes
+ tpfp = pool.starmap(
+ tpfp_fn,
+ zip(cls_dets, cls_gts, cls_gts_ignore,
+ [iou_thr for _ in range(num_imgs)],
+ [area_ranges for _ in range(num_imgs)]))
+ tp, fp = tuple(zip(*tpfp))
+ # calculate gt number of each scale
+ # ignored gts or gts beyond the specific scale are not counted
+ num_gts = np.zeros(num_scales, dtype=int)
+ for j, bbox in enumerate(cls_gts):
+ if area_ranges is None:
+ num_gts[0] += bbox.shape[0]
+ else:
+ gt_areas = (bbox[:, 2] - bbox[:, 0]) * (
+ bbox[:, 3] - bbox[:, 1])
+ for k, (min_area, max_area) in enumerate(area_ranges):
+ num_gts[k] += np.sum((gt_areas >= min_area)
+ & (gt_areas < max_area))
+ # sort all det bboxes by score, also sort tp and fp
+ cls_dets = np.vstack(cls_dets)
+ num_dets = cls_dets.shape[0]
+ sort_inds = np.argsort(-cls_dets[:, -1])
+ tp = np.hstack(tp)[:, sort_inds]
+ fp = np.hstack(fp)[:, sort_inds]
+ # calculate recall and precision with tp and fp
+ tp = np.cumsum(tp, axis=1)
+ fp = np.cumsum(fp, axis=1)
+ eps = np.finfo(np.float32).eps
+ recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
+ precisions = tp / np.maximum((tp + fp), eps)
+ # calculate AP
+ if scale_ranges is None:
+ recalls = recalls[0, :]
+ precisions = precisions[0, :]
+ num_gts = num_gts.item()
+ mode = 'area' if dataset != 'voc07' else '11points'
+ ap = average_precision(recalls, precisions, mode)
+ eval_results.append({
+ 'num_gts': num_gts,
+ 'num_dets': num_dets,
+ 'recall': recalls,
+ 'precision': precisions,
+ 'ap': ap
+ })
+ pool.close()
+ if scale_ranges is not None:
+ # shape (num_classes, num_scales)
+ all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
+ all_num_gts = np.vstack(
+ [cls_result['num_gts'] for cls_result in eval_results])
+ mean_ap = []
+ for i in range(num_scales):
+ if np.any(all_num_gts[:, i] > 0):
+ mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
+ else:
+ mean_ap.append(0.0)
+ else:
+ aps = []
+ for cls_result in eval_results:
+ if cls_result['num_gts'] > 0:
+ aps.append(cls_result['ap'])
+ mean_ap = np.array(aps).mean().item() if aps else 0.0
+
+ print_map_summary(
+ mean_ap, eval_results, dataset, area_ranges, logger=logger)
+
+ return mean_ap, eval_results
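+
+# Minimal usage sketch (illustrative; wrap in a __main__ guard since a
+# multiprocessing Pool is spawned): one image, one class, and a single
+# detection matching the single gt box, so mAP is 1.0.
+# >>> dets = [[np.array([[0., 0., 10., 10., 0.9]])]]   # [image][class]
+# >>> anns = [dict(bboxes=np.array([[0., 0., 10., 10.]]),
+# ...              labels=np.array([0]))]
+# >>> mean_ap, per_cls = eval_map(dets, anns, iou_thr=0.5, nproc=1)  # 1.0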
+
+
+def print_map_summary(mean_ap,
+ results,
+ dataset=None,
+ scale_ranges=None,
+ logger=None):
+ """Print mAP and results of each class.
+
+ A table will be printed to show the gts/dets/recall/AP of each class and
+ the mAP.
+
+ Args:
+ mean_ap (float): Calculated from `eval_map()`.
+ results (list[dict]): Calculated from `eval_map()`.
+ dataset (list[str] | str | None): Dataset name or dataset classes.
+ scale_ranges (list[tuple] | None): Range of scales to be evaluated.
+ logger (logging.Logger | str | None): The way to print the mAP
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+ """
+
+ if logger == 'silent':
+ return
+
+ if isinstance(results[0]['ap'], np.ndarray):
+ num_scales = len(results[0]['ap'])
+ else:
+ num_scales = 1
+
+ if scale_ranges is not None:
+ assert len(scale_ranges) == num_scales
+
+ num_classes = len(results)
+
+ recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
+ aps = np.zeros((num_scales, num_classes), dtype=np.float32)
+ num_gts = np.zeros((num_scales, num_classes), dtype=int)
+ for i, cls_result in enumerate(results):
+ if cls_result['recall'].size > 0:
+ recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
+ aps[:, i] = cls_result['ap']
+ num_gts[:, i] = cls_result['num_gts']
+
+ if dataset is None:
+ label_names = [str(i) for i in range(num_classes)]
+ elif is_str(dataset):
+ label_names = get_classes(dataset)
+ else:
+ label_names = dataset
+
+ if not isinstance(mean_ap, list):
+ mean_ap = [mean_ap]
+
+ header = ['class', 'gts', 'dets', 'recall', 'ap']
+ for i in range(num_scales):
+ if scale_ranges is not None:
+ print_log(f'Scale range {scale_ranges[i]}', logger=logger)
+ table_data = [header]
+ for j in range(num_classes):
+ row_data = [
+ label_names[j], num_gts[i, j], results[j]['num_dets'],
+ f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}'
+ ]
+ table_data.append(row_data)
+ table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}'])
+ table = AsciiTable(table_data)
+ table.inner_footing_row_border = True
+ print_log('\n' + table.table, logger=logger)
diff --git a/mmcv/core/evaluation/metric_motion.py b/mmcv/core/evaluation/metric_motion.py
new file mode 100644
index 0000000..8219438
--- /dev/null
+++ b/mmcv/core/evaluation/metric_motion.py
@@ -0,0 +1,70 @@
+#
+
+"""This module evaluates the forecasted trajectories against the ground truth."""
+
+import math
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+
+LOW_PROB_THRESHOLD_FOR_METRICS = 0.05
+
+
+def get_ade(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float:
+ """Compute Average Displacement Error.
+ Args:
+ forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2]
+ gt_trajectory: Ground truth trajectory with shape [fut_ts, 2]
+ Returns:
+ ade: Average Displacement Error
+ """
+ pred_len = forecasted_trajectory.shape[0]
+ ade = float(
+ sum(
+ torch.sqrt(
+ (forecasted_trajectory[i, 0] - gt_trajectory[i, 0]) ** 2
+ + (forecasted_trajectory[i, 1] - gt_trajectory[i, 1]) ** 2
+ )
+ for i in range(pred_len)
+ )
+ / pred_len
+ )
+ return ade
+
+def get_best_preds(
+ forecasted_trajectory: torch.Tensor,
+ gt_trajectory: torch.Tensor
+) -> torch.Tensor:
+    """Select the best predicted trajectory among the k modes, i.e. the one
+    with the minimum final displacement from the ground truth.
+    Args:
+        forecasted_trajectory: Predicted trajectories with shape [k, fut_ts, 2]
+        gt_trajectory: Ground truth trajectory with shape [fut_ts, 2]
+    Returns:
+        best_forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2]
+    """
+
+ # [k, fut_ts]
+ dist = torch.linalg.norm(gt_trajectory[None] - forecasted_trajectory, dim=-1)
+ dist = dist[..., -1]
+ dist[torch.isnan(dist)] = 0
+ min_mode_idx = torch.argmin(dist, dim=-1)
+
+ return forecasted_trajectory[min_mode_idx]
+
+def get_fde(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float:
+ """Compute Final Displacement Error.
+ Args:
+ forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2]
+ gt_trajectory: Ground truth trajectory with shape [fut_ts, 2]
+ Returns:
+ fde: Final Displacement Error
+ """
+ fde = float(
+ torch.sqrt(
+ (forecasted_trajectory[-1, 0] - gt_trajectory[-1, 0]) ** 2
+ + (forecasted_trajectory[-1, 1] - gt_trajectory[-1, 1]) ** 2
+ )
+ )
+ return fde
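+
+# Illustrative example (not part of the module API): a prediction offset from
+# the ground truth by (3, 4) at every timestep has a per-step error of 5.0,
+# so ADE and FDE are both 5.0.
+# >>> gt = torch.zeros(6, 2)
+# >>> pred = gt + torch.tensor([3.0, 4.0])
+# >>> get_ade(pred, gt), get_fde(pred, gt)  # (5.0, 5.0)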
diff --git a/mmcv/core/evaluation/metrics.py b/mmcv/core/evaluation/metrics.py
new file mode 100644
index 0000000..551203a
--- /dev/null
+++ b/mmcv/core/evaluation/metrics.py
@@ -0,0 +1,325 @@
+from collections import OrderedDict
+from mmcv.image import imread
+import numpy as np
+import torch
+
+
+def f_score(precision, recall, beta=1):
+ """calcuate the f-score value.
+
+ Args:
+ precision (float | torch.Tensor): The precision value.
+ recall (float | torch.Tensor): The recall value.
+        beta (int): Determines the weight of recall in the combined score.
+            Default: 1.
+
+ Returns:
+ [torch.tensor]: The f-score value.
+ """
+ score = (1 + beta**2) * (precision * recall) / (
+ (beta**2 * precision) + recall)
+ return score
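+
+# Illustrative check: with beta=1 this is the harmonic mean of precision and
+# recall, e.g. precision=0.6, recall=0.4 gives 2 * 0.24 / 1.0 = 0.48.
+# >>> f_score(0.6, 0.4)  # 0.48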
+
+
+def intersect_and_union(pred_label,
+ label,
+ num_classes,
+ ignore_index,
+ label_map=dict(),
+ reduce_zero_label=False):
+ """Calculate intersection and Union.
+
+ Args:
+ pred_label (ndarray | str): Prediction segmentation map
+ or predict result filename.
+ label (ndarray | str): Ground truth segmentation map
+ or label filename.
+ num_classes (int): Number of categories.
+ ignore_index (int): Index that will be ignored in evaluation.
+ label_map (dict): Mapping old labels to new labels. The parameter will
+ work only when label is str. Default: dict().
+        reduce_zero_label (bool): Whether to ignore zero label. The parameter will
+ work only when label is str. Default: False.
+
+ Returns:
+ torch.Tensor: The intersection of prediction and ground truth
+ histogram on all classes.
+ torch.Tensor: The union of prediction and ground truth histogram on
+ all classes.
+ torch.Tensor: The prediction histogram on all classes.
+ torch.Tensor: The ground truth histogram on all classes.
+ """
+
+ if isinstance(pred_label, str):
+ pred_label = torch.from_numpy(np.load(pred_label))
+ else:
+ pred_label = torch.from_numpy((pred_label))
+
+ if isinstance(label, str):
+ label = torch.from_numpy(
+ imread(label, flag='unchanged', backend='pillow'))
+ else:
+ label = torch.from_numpy(label)
+
+ if label_map is not None:
+ for old_id, new_id in label_map.items():
+ label[label == old_id] = new_id
+ if reduce_zero_label:
+ label[label == 0] = 255
+ label = label - 1
+ label[label == 254] = 255
+
+ mask = (label != ignore_index)
+ pred_label = pred_label[mask]
+ label = label[mask]
+
+ intersect = pred_label[pred_label == label]
+ area_intersect = torch.histc(
+ intersect.float(), bins=(num_classes), min=0, max=num_classes - 1)
+ area_pred_label = torch.histc(
+ pred_label.float(), bins=(num_classes), min=0, max=num_classes - 1)
+ area_label = torch.histc(
+ label.float(), bins=(num_classes), min=0, max=num_classes - 1)
+ area_union = area_pred_label + area_label - area_intersect
+ return area_intersect, area_union, area_pred_label, area_label
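+
+# Minimal sketch (illustrative): 3 classes, no ignored pixels; the histograms
+# below can be checked by hand (class 2 has one missed pixel).
+# >>> pred = np.array([0, 1, 1, 2])
+# >>> gt = np.array([0, 1, 2, 2])
+# >>> inter, union, _, _ = intersect_and_union(pred, gt, num_classes=3,
+# ...                                          ignore_index=255)
+# >>> inter.tolist(), union.tolist()  # ([1.0, 1.0, 1.0], [1.0, 2.0, 2.0])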
+
+
+def total_intersect_and_union(results,
+ gt_seg_maps,
+ num_classes,
+ ignore_index,
+ label_map=dict(),
+ reduce_zero_label=False):
+ """Calculate Total Intersection and Union.
+
+ Args:
+ results (list[ndarray] | list[str]): List of prediction segmentation
+ maps or list of prediction result filenames.
+ gt_seg_maps (list[ndarray] | list[str]): list of ground truth
+ segmentation maps or list of label filenames.
+ num_classes (int): Number of categories.
+ ignore_index (int): Index that will be ignored in evaluation.
+ label_map (dict): Mapping old labels to new labels. Default: dict().
+        reduce_zero_label (bool): Whether to ignore zero label. Default: False.
+
+ Returns:
+ ndarray: The intersection of prediction and ground truth histogram
+ on all classes.
+ ndarray: The union of prediction and ground truth histogram on all
+ classes.
+ ndarray: The prediction histogram on all classes.
+ ndarray: The ground truth histogram on all classes.
+ """
+ num_imgs = len(results)
+ assert len(gt_seg_maps) == num_imgs
+ total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64)
+ total_area_union = torch.zeros((num_classes, ), dtype=torch.float64)
+ total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64)
+ total_area_label = torch.zeros((num_classes, ), dtype=torch.float64)
+ for i in range(num_imgs):
+ area_intersect, area_union, area_pred_label, area_label = \
+ intersect_and_union(
+ results[i], gt_seg_maps[i], num_classes, ignore_index,
+ label_map, reduce_zero_label)
+ total_area_intersect += area_intersect
+ total_area_union += area_union
+ total_area_pred_label += area_pred_label
+ total_area_label += area_label
+ return total_area_intersect, total_area_union, total_area_pred_label, \
+ total_area_label
+
+
+def mean_iou(results,
+ gt_seg_maps,
+ num_classes,
+ ignore_index,
+ nan_to_num=None,
+ label_map=dict(),
+ reduce_zero_label=False):
+ """Calculate Mean Intersection and Union (mIoU)
+
+ Args:
+ results (list[ndarray] | list[str]): List of prediction segmentation
+ maps or list of prediction result filenames.
+ gt_seg_maps (list[ndarray] | list[str]): list of ground truth
+ segmentation maps or list of label filenames.
+ num_classes (int): Number of categories.
+ ignore_index (int): Index that will be ignored in evaluation.
+ nan_to_num (int, optional): If specified, NaN values will be replaced
+ by the numbers defined by the user. Default: None.
+ label_map (dict): Mapping old labels to new labels. Default: dict().
+        reduce_zero_label (bool): Whether to ignore zero label. Default: False.
+
+ Returns:
+ dict[str, float | ndarray]:
+ float: Overall accuracy on all images.
+ ndarray: Per category accuracy, shape (num_classes, ).
+ ndarray: Per category IoU, shape (num_classes, ).
+ """
+ iou_result = eval_metrics(
+ results=results,
+ gt_seg_maps=gt_seg_maps,
+ num_classes=num_classes,
+ ignore_index=ignore_index,
+ metrics=['mIoU'],
+ nan_to_num=nan_to_num,
+ label_map=label_map,
+ reduce_zero_label=reduce_zero_label)
+ return iou_result
+
+
+def mean_dice(results,
+ gt_seg_maps,
+ num_classes,
+ ignore_index,
+ nan_to_num=None,
+ label_map=dict(),
+ reduce_zero_label=False):
+ """Calculate Mean Dice (mDice)
+
+ Args:
+ results (list[ndarray] | list[str]): List of prediction segmentation
+ maps or list of prediction result filenames.
+ gt_seg_maps (list[ndarray] | list[str]): list of ground truth
+ segmentation maps or list of label filenames.
+ num_classes (int): Number of categories.
+ ignore_index (int): Index that will be ignored in evaluation.
+ nan_to_num (int, optional): If specified, NaN values will be replaced
+ by the numbers defined by the user. Default: None.
+ label_map (dict): Mapping old labels to new labels. Default: dict().
+        reduce_zero_label (bool): Whether to ignore zero label. Default: False.
+
+ Returns:
+ dict[str, float | ndarray]: Default metrics.
+ float: Overall accuracy on all images.
+ ndarray: Per category accuracy, shape (num_classes, ).
+ ndarray: Per category dice, shape (num_classes, ).
+ """
+
+ dice_result = eval_metrics(
+ results=results,
+ gt_seg_maps=gt_seg_maps,
+ num_classes=num_classes,
+ ignore_index=ignore_index,
+ metrics=['mDice'],
+ nan_to_num=nan_to_num,
+ label_map=label_map,
+ reduce_zero_label=reduce_zero_label)
+ return dice_result
+
+
+def mean_fscore(results,
+ gt_seg_maps,
+ num_classes,
+ ignore_index,
+ nan_to_num=None,
+ label_map=dict(),
+ reduce_zero_label=False,
+ beta=1):
+ """Calculate Mean Intersection and Union (mIoU)
+
+ Args:
+ results (list[ndarray] | list[str]): List of prediction segmentation
+ maps or list of prediction result filenames.
+ gt_seg_maps (list[ndarray] | list[str]): list of ground truth
+ segmentation maps or list of label filenames.
+ num_classes (int): Number of categories.
+ ignore_index (int): Index that will be ignored in evaluation.
+ nan_to_num (int, optional): If specified, NaN values will be replaced
+ by the numbers defined by the user. Default: None.
+ label_map (dict): Mapping old labels to new labels. Default: dict().
+        reduce_zero_label (bool): Whether to ignore zero label. Default: False.
+        beta (int): Determines the weight of recall in the combined score.
+            Default: 1.
+
+
+ Returns:
+ dict[str, float | ndarray]: Default metrics.
+ float: Overall accuracy on all images.
+ ndarray: Per category recall, shape (num_classes, ).
+ ndarray: Per category precision, shape (num_classes, ).
+ ndarray: Per category f-score, shape (num_classes, ).
+ """
+ fscore_result = eval_metrics(
+ results=results,
+ gt_seg_maps=gt_seg_maps,
+ num_classes=num_classes,
+ ignore_index=ignore_index,
+ metrics=['mFscore'],
+ nan_to_num=nan_to_num,
+ label_map=label_map,
+ reduce_zero_label=reduce_zero_label,
+ beta=beta)
+ return fscore_result
+
+
+def eval_metrics(results,
+ gt_seg_maps,
+ num_classes,
+ ignore_index,
+ metrics=['mIoU'],
+ nan_to_num=None,
+ label_map=dict(),
+ reduce_zero_label=False,
+ beta=1):
+ """Calculate evaluation metrics
+ Args:
+ results (list[ndarray] | list[str]): List of prediction segmentation
+ maps or list of prediction result filenames.
+ gt_seg_maps (list[ndarray] | list[str]): list of ground truth
+ segmentation maps or list of label filenames.
+ num_classes (int): Number of categories.
+ ignore_index (int): Index that will be ignored in evaluation.
+ metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'.
+ nan_to_num (int, optional): If specified, NaN values will be replaced
+ by the numbers defined by the user. Default: None.
+ label_map (dict): Mapping old labels to new labels. Default: dict().
+        reduce_zero_label (bool): Whether to ignore zero label. Default: False.
+ Returns:
+ float: Overall accuracy on all images.
+ ndarray: Per category accuracy, shape (num_classes, ).
+ ndarray: Per category evaluation metrics, shape (num_classes, ).
+ """
+ if isinstance(metrics, str):
+ metrics = [metrics]
+ allowed_metrics = ['mIoU', 'mDice', 'mFscore']
+ if not set(metrics).issubset(set(allowed_metrics)):
+ raise KeyError('metrics {} is not supported'.format(metrics))
+
+ total_area_intersect, total_area_union, total_area_pred_label, \
+ total_area_label = total_intersect_and_union(
+ results, gt_seg_maps, num_classes, ignore_index, label_map,
+ reduce_zero_label)
+ all_acc = total_area_intersect.sum() / total_area_label.sum()
+ ret_metrics = OrderedDict({'aAcc': all_acc})
+ for metric in metrics:
+ if metric == 'mIoU':
+ iou = total_area_intersect / total_area_union
+ acc = total_area_intersect / total_area_label
+ ret_metrics['IoU'] = iou
+ ret_metrics['Acc'] = acc
+ elif metric == 'mDice':
+ dice = 2 * total_area_intersect / (
+ total_area_pred_label + total_area_label)
+ acc = total_area_intersect / total_area_label
+ ret_metrics['Dice'] = dice
+ ret_metrics['Acc'] = acc
+ elif metric == 'mFscore':
+ precision = total_area_intersect / total_area_pred_label
+ recall = total_area_intersect / total_area_label
+ f_value = torch.tensor(
+ [f_score(x[0], x[1], beta) for x in zip(precision, recall)])
+ ret_metrics['Fscore'] = f_value
+ ret_metrics['Precision'] = precision
+ ret_metrics['Recall'] = recall
+
+ ret_metrics = {
+ metric: value.numpy()
+ for metric, value in ret_metrics.items()
+ }
+ if nan_to_num is not None:
+ ret_metrics = OrderedDict({
+ metric: np.nan_to_num(metric_value, nan=nan_to_num)
+ for metric, metric_value in ret_metrics.items()
+ })
+ return ret_metrics
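+
+# Usage sketch (illustrative): one 2x2 prediction/label pair with 3 classes;
+# classes 0 and 1 are fully correct, one class-2 pixel is mispredicted, so
+# aAcc is 0.75 and the per-class IoU is roughly [1.0, 0.5, 0.5].
+# >>> res = [np.array([[0, 1], [1, 2]])]
+# >>> gt = [np.array([[0, 1], [2, 2]])]
+# >>> eval_metrics(res, gt, num_classes=3, ignore_index=255, metrics=['mIoU'])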
diff --git a/mmcv/core/evaluation/recall.py b/mmcv/core/evaluation/recall.py
new file mode 100644
index 0000000..23ec744
--- /dev/null
+++ b/mmcv/core/evaluation/recall.py
@@ -0,0 +1,189 @@
+from collections.abc import Sequence
+
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+
+
+def _recalls(all_ious, proposal_nums, thrs):
+
+ img_num = all_ious.shape[0]
+ total_gt_num = sum([ious.shape[0] for ious in all_ious])
+
+ _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
+ for k, proposal_num in enumerate(proposal_nums):
+ tmp_ious = np.zeros(0)
+ for i in range(img_num):
+ ious = all_ious[i][:, :proposal_num].copy()
+ gt_ious = np.zeros((ious.shape[0]))
+ if ious.size == 0:
+ tmp_ious = np.hstack((tmp_ious, gt_ious))
+ continue
+ for j in range(ious.shape[0]):
+ gt_max_overlaps = ious.argmax(axis=1)
+ max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
+ gt_idx = max_ious.argmax()
+ gt_ious[j] = max_ious[gt_idx]
+ box_idx = gt_max_overlaps[gt_idx]
+ ious[gt_idx, :] = -1
+ ious[:, box_idx] = -1
+ tmp_ious = np.hstack((tmp_ious, gt_ious))
+ _ious[k, :] = tmp_ious
+
+ _ious = np.fliplr(np.sort(_ious, axis=1))
+ recalls = np.zeros((proposal_nums.size, thrs.size))
+ for i, thr in enumerate(thrs):
+ recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
+
+ return recalls
+
+
+def set_recall_param(proposal_nums, iou_thrs):
+ """Check proposal_nums and iou_thrs and set correct format."""
+ if isinstance(proposal_nums, Sequence):
+ _proposal_nums = np.array(proposal_nums)
+ elif isinstance(proposal_nums, int):
+ _proposal_nums = np.array([proposal_nums])
+ else:
+ _proposal_nums = proposal_nums
+
+ if iou_thrs is None:
+ _iou_thrs = np.array([0.5])
+ elif isinstance(iou_thrs, Sequence):
+ _iou_thrs = np.array(iou_thrs)
+ elif isinstance(iou_thrs, float):
+ _iou_thrs = np.array([iou_thrs])
+ else:
+ _iou_thrs = iou_thrs
+
+ return _proposal_nums, _iou_thrs
+
+
+def eval_recalls(gts,
+ proposals,
+ proposal_nums=None,
+ iou_thrs=0.5,
+ logger=None):
+ """Calculate recalls.
+
+ Args:
+ gts (list[ndarray]): a list of arrays of shape (n, 4)
+ proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5)
+ proposal_nums (int | Sequence[int]): Top N proposals to be evaluated.
+ iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5.
+ logger (logging.Logger | str | None): The way to print the recall
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+
+ Returns:
+ ndarray: recalls of different ious and proposal nums
+ """
+
+ img_num = len(gts)
+ assert img_num == len(proposals)
+
+ proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
+
+ all_ious = []
+ for i in range(img_num):
+ if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
+ scores = proposals[i][:, 4]
+ sort_idx = np.argsort(scores)[::-1]
+ img_proposal = proposals[i][sort_idx, :]
+ else:
+ img_proposal = proposals[i]
+ prop_num = min(img_proposal.shape[0], proposal_nums[-1])
+ if gts[i] is None or gts[i].shape[0] == 0:
+ ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
+ else:
+ ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4])
+ all_ious.append(ious)
+    # per-image IoU arrays can differ in shape, so keep an object array
+    all_ious = np.array(all_ious, dtype=object)
+ recalls = _recalls(all_ious, proposal_nums, iou_thrs)
+
+ print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger)
+ return recalls
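+
+# Usage sketch (illustrative): one image with a single gt box and two scored
+# proposals; the top proposal covers the gt, so recall is 1.0 for both
+# proposal budgets and the returned array is [[1.0], [1.0]].
+# >>> gts = [np.array([[0., 0., 10., 10.]])]
+# >>> props = [np.array([[0., 0., 10., 10., 0.9], [20., 20., 30., 30., 0.1]])]
+# >>> eval_recalls(gts, props, proposal_nums=[1, 2], iou_thrs=0.5)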
+
+
+def print_recall_summary(recalls,
+ proposal_nums,
+ iou_thrs,
+ row_idxs=None,
+ col_idxs=None,
+ logger=None):
+ """Print recalls in a table.
+
+ Args:
+ recalls (ndarray): calculated from `bbox_recalls`
+ proposal_nums (ndarray or list): top N proposals
+ iou_thrs (ndarray or list): iou thresholds
+ row_idxs (ndarray): which rows(proposal nums) to print
+ col_idxs (ndarray): which cols(iou thresholds) to print
+ logger (logging.Logger | str | None): The way to print the recall
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+ """
+ proposal_nums = np.array(proposal_nums, dtype=np.int32)
+ iou_thrs = np.array(iou_thrs)
+ if row_idxs is None:
+ row_idxs = np.arange(proposal_nums.size)
+ if col_idxs is None:
+ col_idxs = np.arange(iou_thrs.size)
+ row_header = [''] + iou_thrs[col_idxs].tolist()
+ table_data = [row_header]
+ for i, num in enumerate(proposal_nums[row_idxs]):
+ row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()]
+ row.insert(0, num)
+ table_data.append(row)
+ table = AsciiTable(table_data)
+ print_log('\n' + table.table, logger=logger)
+
+
+def plot_num_recall(recalls, proposal_nums):
+ """Plot Proposal_num-Recalls curve.
+
+ Args:
+ recalls(ndarray or list): shape (k,)
+ proposal_nums(ndarray or list): same shape as `recalls`
+ """
+ if isinstance(proposal_nums, np.ndarray):
+ _proposal_nums = proposal_nums.tolist()
+ else:
+ _proposal_nums = proposal_nums
+ if isinstance(recalls, np.ndarray):
+ _recalls = recalls.tolist()
+ else:
+ _recalls = recalls
+
+ import matplotlib.pyplot as plt
+ f = plt.figure()
+ plt.plot([0] + _proposal_nums, [0] + _recalls)
+ plt.xlabel('Proposal num')
+ plt.ylabel('Recall')
+ plt.axis([0, proposal_nums.max(), 0, 1])
+ f.show()
+
+
+def plot_iou_recall(recalls, iou_thrs):
+ """Plot IoU-Recalls curve.
+
+ Args:
+ recalls(ndarray or list): shape (k,)
+ iou_thrs(ndarray or list): same shape as `recalls`
+ """
+ if isinstance(iou_thrs, np.ndarray):
+ _iou_thrs = iou_thrs.tolist()
+ else:
+ _iou_thrs = iou_thrs
+ if isinstance(recalls, np.ndarray):
+ _recalls = recalls.tolist()
+ else:
+ _recalls = recalls
+
+ import matplotlib.pyplot as plt
+ f = plt.figure()
+ plt.plot(_iou_thrs + [1.0], _recalls + [0.])
+ plt.xlabel('IoU')
+ plt.ylabel('Recall')
+ plt.axis([iou_thrs.min(), 1, 0, 1])
+ f.show()
diff --git a/mmcv/core/evaluation/seg_eval.py b/mmcv/core/evaluation/seg_eval.py
new file mode 100644
index 0000000..542fedc
--- /dev/null
+++ b/mmcv/core/evaluation/seg_eval.py
@@ -0,0 +1,131 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+
+def fast_hist(preds, labels, num_classes):
+ """Compute the confusion matrix for every batch.
+
+ Args:
+ preds (np.ndarray): Prediction labels of points with shape of
+ (num_points, ).
+ labels (np.ndarray): Ground truth labels of points with shape of
+ (num_points, ).
+ num_classes (int): number of classes
+
+ Returns:
+ np.ndarray: Calculated confusion matrix.
+ """
+
+ k = (labels >= 0) & (labels < num_classes)
+ bin_count = np.bincount(
+ num_classes * labels[k].astype(int) + preds[k],
+ minlength=num_classes**2)
+ return bin_count[:num_classes**2].reshape(num_classes, num_classes)
+
+
+def per_class_iou(hist):
+ """Compute the per class iou.
+
+ Args:
+        hist(np.ndarray): Overall confusion matrix
+            (num_classes, num_classes).
+
+ Returns:
+ np.ndarray: Calculated per class iou
+ """
+
+ return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
+
+
+def get_acc(hist):
+ """Compute the overall accuracy.
+
+ Args:
+        hist(np.ndarray): Overall confusion matrix
+            (num_classes, num_classes).
+
+ Returns:
+ float: Calculated overall acc
+ """
+
+ return np.diag(hist).sum() / hist.sum()
+
+
+def get_acc_cls(hist):
+ """Compute the class average accuracy.
+
+ Args:
+        hist(np.ndarray): Overall confusion matrix
+            (num_classes, num_classes).
+
+ Returns:
+ float: Calculated class average acc
+ """
+
+ return np.nanmean(np.diag(hist) / hist.sum(axis=1))
+
+
+def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None):
+ """Semantic Segmentation Evaluation.
+
+ Evaluate the result of the Semantic Segmentation.
+
+ Args:
+ gt_labels (list[torch.Tensor]): Ground truth labels.
+ seg_preds (list[torch.Tensor]): Predictions.
+ label2cat (dict): Map from label to category name.
+ ignore_index (int): Index that will be ignored in evaluation.
+ logger (logging.Logger | str | None): The way to print the mAP
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+
+ Returns:
+ dict[str, float]: Dict of results.
+ """
+ assert len(seg_preds) == len(gt_labels)
+ num_classes = len(label2cat)
+
+ hist_list = []
+ for i in range(len(gt_labels)):
+        gt_seg = gt_labels[i].clone().numpy().astype(int)
+        pred_seg = seg_preds[i].clone().numpy().astype(int)
+
+ # filter out ignored points
+ pred_seg[gt_seg == ignore_index] = -1
+ gt_seg[gt_seg == ignore_index] = -1
+
+ # calculate one instance result
+ hist_list.append(fast_hist(pred_seg, gt_seg, num_classes))
+
+ iou = per_class_iou(sum(hist_list))
+ miou = np.nanmean(iou)
+ acc = get_acc(sum(hist_list))
+ acc_cls = get_acc_cls(sum(hist_list))
+
+ header = ['classes']
+ for i in range(len(label2cat)):
+ header.append(label2cat[i])
+ header.extend(['miou', 'acc', 'acc_cls'])
+
+ ret_dict = dict()
+ table_columns = [['results']]
+ for i in range(len(label2cat)):
+ ret_dict[label2cat[i]] = float(iou[i])
+ table_columns.append([f'{iou[i]:.4f}'])
+ ret_dict['miou'] = float(miou)
+ ret_dict['acc'] = float(acc)
+ ret_dict['acc_cls'] = float(acc_cls)
+
+ table_columns.append([f'{miou:.4f}'])
+ table_columns.append([f'{acc:.4f}'])
+ table_columns.append([f'{acc_cls:.4f}'])
+
+ table_data = [header]
+ table_rows = list(zip(*table_columns))
+ table_data += table_rows
+ table = AsciiTable(table_data)
+ table.inner_footing_row_border = True
+ print_log('\n' + table.table, logger=logger)
+
+ return ret_dict
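+
+# Usage sketch (illustrative; the category names are made up): three points,
+# one class-2 point mispredicted as class 1, giving IoU [1.0, 0.5, 0.0] and
+# mIoU 0.5.
+# >>> import torch
+# >>> gt = [torch.tensor([0, 1, 2])]
+# >>> pred = [torch.tensor([0, 1, 1])]
+# >>> seg_eval(gt, pred, {0: 'road', 1: 'car', 2: 'pedestrian'},
+# ...          ignore_index=255)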
diff --git a/mmcv/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py b/mmcv/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py
new file mode 100644
index 0000000..014b480
--- /dev/null
+++ b/mmcv/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py
@@ -0,0 +1,262 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Adapted from `Waymo to KITTI converter
+ `_.
+"""
+
+try:
+ from waymo_open_dataset import dataset_pb2 as open_dataset
+except ImportError:
+ raise ImportError(
+ 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
+ 'to install the official devkit first.')
+
+from mmcv.utils import mkdir_or_exist, track_parallel_progress
+import numpy as np
+import tensorflow as tf
+from glob import glob
+from os.path import join
+from waymo_open_dataset import label_pb2
+from waymo_open_dataset.protos import metrics_pb2
+
+
+class KITTI2Waymo(object):
+ """KITTI predictions to Waymo converter.
+
+ This class serves as the converter to change predictions from KITTI to
+ Waymo format.
+
+ Args:
+ kitti_result_files (list[dict]): Predictions in KITTI format.
+ waymo_tfrecords_dir (str): Directory to load waymo raw data.
+ waymo_results_save_dir (str): Directory to save converted predictions
+ in waymo format (.bin files).
+ waymo_results_final_path (str): Path to save combined
+ predictions in waymo format (.bin file), like 'a/b/c.bin'.
+ prefix (str): Prefix of filename. In general, 0 for training, 1 for
+ validation and 2 for testing.
+        workers (int): Number of parallel processes.
+ """
+
+ def __init__(self,
+ kitti_result_files,
+ waymo_tfrecords_dir,
+ waymo_results_save_dir,
+ waymo_results_final_path,
+ prefix,
+ workers=64):
+
+ self.kitti_result_files = kitti_result_files
+ self.waymo_tfrecords_dir = waymo_tfrecords_dir
+ self.waymo_results_save_dir = waymo_results_save_dir
+ self.waymo_results_final_path = waymo_results_final_path
+ self.prefix = prefix
+ self.workers = int(workers)
+ self.name2idx = {}
+ for idx, result in enumerate(kitti_result_files):
+ if len(result['sample_idx']) > 0:
+ self.name2idx[str(result['sample_idx'][0])] = idx
+
+ # turn on eager execution for older tensorflow versions
+ if int(tf.__version__.split('.')[0]) < 2:
+ tf.enable_eager_execution()
+
+ self.k2w_cls_map = {
+ 'Car': label_pb2.Label.TYPE_VEHICLE,
+ 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN,
+ 'Sign': label_pb2.Label.TYPE_SIGN,
+ 'Cyclist': label_pb2.Label.TYPE_CYCLIST,
+ }
+
+ self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
+ [-1.0, 0.0, 0.0, 0.0],
+ [0.0, -1.0, 0.0, 0.0],
+ [0.0, 0.0, 0.0, 1.0]])
+
+ self.get_file_names()
+ self.create_folder()
+
+ def get_file_names(self):
+ """Get file names of waymo raw data."""
+ self.waymo_tfrecord_pathnames = sorted(
+ glob(join(self.waymo_tfrecords_dir, '*.tfrecord')))
+ print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.')
+
+ def create_folder(self):
+ """Create folder for data conversion."""
+ mkdir_or_exist(self.waymo_results_save_dir)
+
+ def parse_objects(self, kitti_result, T_k2w, context_name,
+ frame_timestamp_micros):
+ """Parse one prediction with several instances in kitti format and
+ convert them to `Object` proto.
+
+ Args:
+ kitti_result (dict): Predictions in kitti format.
+
+ - name (np.ndarray): Class labels of predictions.
+ - dimensions (np.ndarray): Height, width, length of boxes.
+ - location (np.ndarray): Bottom center of boxes (x, y, z).
+ - rotation_y (np.ndarray): Orientation of boxes.
+ - score (np.ndarray): Scores of predictions.
+ T_k2w (np.ndarray): Transformation matrix from kitti to waymo.
+ context_name (str): Context name of the frame.
+ frame_timestamp_micros (int): Frame timestamp.
+
+ Returns:
+ :obj:`Object`: Predictions in waymo dataset Object proto.
+ """
+
+ def parse_one_object(instance_idx):
+ """Parse one instance in kitti format and convert them to `Object`
+ proto.
+
+ Args:
+ instance_idx (int): Index of the instance to be converted.
+
+ Returns:
+ :obj:`Object`: Predicted instance in waymo dataset \
+ Object proto.
+ """
+ cls = kitti_result['name'][instance_idx]
+ length = round(kitti_result['dimensions'][instance_idx, 0], 4)
+ height = round(kitti_result['dimensions'][instance_idx, 1], 4)
+ width = round(kitti_result['dimensions'][instance_idx, 2], 4)
+ x = round(kitti_result['location'][instance_idx, 0], 4)
+ y = round(kitti_result['location'][instance_idx, 1], 4)
+ z = round(kitti_result['location'][instance_idx, 2], 4)
+ rotation_y = round(kitti_result['rotation_y'][instance_idx], 4)
+ score = round(kitti_result['score'][instance_idx], 4)
+
+ # y: downwards; move box origin from bottom center (kitti) to
+ # true center (waymo)
+ y -= height / 2
+ # frame transformation: kitti -> waymo
+ x, y, z = self.transform(T_k2w, x, y, z)
+
+ # different conventions
+ heading = -(rotation_y + np.pi / 2)
+ while heading < -np.pi:
+ heading += 2 * np.pi
+ while heading > np.pi:
+ heading -= 2 * np.pi
+
+ box = label_pb2.Label.Box()
+ box.center_x = x
+ box.center_y = y
+ box.center_z = z
+ box.length = length
+ box.width = width
+ box.height = height
+ box.heading = heading
+
+ o = metrics_pb2.Object()
+ o.object.box.CopyFrom(box)
+ o.object.type = self.k2w_cls_map[cls]
+ o.score = score
+
+ o.context_name = context_name
+ o.frame_timestamp_micros = frame_timestamp_micros
+
+ return o
+
+ objects = metrics_pb2.Objects()
+
+ for instance_idx in range(len(kitti_result['name'])):
+ o = parse_one_object(instance_idx)
+ objects.objects.append(o)
+
+ return objects
+
+ def convert_one(self, file_idx):
+ """Convert action for single file.
+
+ Args:
+ file_idx (int): Index of the file to be converted.
+ """
+ file_pathname = self.waymo_tfrecord_pathnames[file_idx]
+ file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')
+
+ for frame_num, frame_data in enumerate(file_data):
+ frame = open_dataset.Frame()
+ frame.ParseFromString(bytearray(frame_data.numpy()))
+
+ filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'
+
+ for camera in frame.context.camera_calibrations:
+ # FRONT = 1, see dataset.proto for details
+ if camera.name == 1:
+ T_front_cam_to_vehicle = np.array(
+ camera.extrinsic.transform).reshape(4, 4)
+
+ T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam
+
+ context_name = frame.context.name
+ frame_timestamp_micros = frame.timestamp_micros
+
+ if filename in self.name2idx:
+ kitti_result = \
+ self.kitti_result_files[self.name2idx[filename]]
+ objects = self.parse_objects(kitti_result, T_k2w, context_name,
+ frame_timestamp_micros)
+ else:
+ print(filename, 'not found.')
+ objects = metrics_pb2.Objects()
+
+ with open(
+ join(self.waymo_results_save_dir, f'{filename}.bin'),
+ 'wb') as f:
+ f.write(objects.SerializeToString())
+
+ def convert(self):
+ """Convert action."""
+ print('Start converting ...')
+ track_parallel_progress(self.convert_one, range(len(self)),
+ self.workers)
+ print('\nFinished ...')
+
+ # combine all files into one .bin
+ pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin')))
+ combined = self.combine(pathnames)
+
+ with open(self.waymo_results_final_path, 'wb') as f:
+ f.write(combined.SerializeToString())
+
+ def __len__(self):
+ """Length of the filename list."""
+ return len(self.waymo_tfrecord_pathnames)
+
+ def transform(self, T, x, y, z):
+ """Transform the coordinates with matrix T.
+
+ Args:
+ T (np.ndarray): Transformation matrix.
+ x(float): Coordinate in x axis.
+ y(float): Coordinate in y axis.
+ z(float): Coordinate in z axis.
+
+ Returns:
+ list: Coordinates after transformation.
+ """
+ pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1)
+ pt_aft = np.matmul(T, pt_bef)
+ return pt_aft[:3].flatten().tolist()
+
+ def combine(self, pathnames):
+ """Combine predictions in waymo format for each sample together.
+
+ Args:
+            pathnames (list[str]): Paths of the saved per-sample predictions.
+
+ Returns:
+ :obj:`Objects`: Combined predictions in Objects proto.
+ """
+ combined = metrics_pb2.Objects()
+
+ for pathname in pathnames:
+ objects = metrics_pb2.Objects()
+ with open(pathname, 'rb') as f:
+ objects.ParseFromString(f.read())
+ for o in objects.objects:
+ combined.objects.append(o)
+
+ return combined
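+
+# Usage sketch (illustrative; every path below is hypothetical): convert
+# per-sample KITTI-format predictions into a single Waymo .bin file.
+# >>> converter = KITTI2Waymo(
+# ...     kitti_result_files,                    # list of kitti-format dicts
+# ...     'data/waymo/waymo_format/validation',  # directory of *.tfrecord
+# ...     'results/waymo_parts',                 # per-file .bin output dir
+# ...     'results/waymo_pred.bin',              # combined output file
+# ...     prefix='1', workers=8)
+# >>> converter.convert()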
diff --git a/mmcv/core/mask/__init__.py b/mmcv/core/mask/__init__.py
new file mode 100644
index 0000000..02cbbc7
--- /dev/null
+++ b/mmcv/core/mask/__init__.py
@@ -0,0 +1,6 @@
+from .mask_target import mask_target
+from .utils import encode_mask_results, split_combined_polys
+
+__all__ = [
+ 'split_combined_polys', 'mask_target', 'encode_mask_results'
+]
diff --git a/mmcv/core/mask/mask_target.py b/mmcv/core/mask/mask_target.py
new file mode 100644
index 0000000..e8f5461
--- /dev/null
+++ b/mmcv/core/mask/mask_target.py
@@ -0,0 +1,126 @@
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair
+
+
+def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,
+ cfg):
+ """Compute mask target for positive proposals in multiple images.
+
+ Args:
+ pos_proposals_list (list[Tensor]): Positive proposals in multiple
+ images.
+ pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each
+ positive proposals.
+ gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of
+ each image.
+ cfg (dict): Config dict that specifies the mask size.
+
+ Returns:
+ list[Tensor]: Mask target of each image.
+
+ Example:
+ >>> import mmcv
+ >>> import mmdet
+ >>> from mmcv.core.mask import BitmapMasks
+ >>> from mmcv.core.mask.mask_target import *
+ >>> H, W = 17, 18
+ >>> cfg = mmcv.Config({'mask_size': (13, 14)})
+ >>> rng = np.random.RandomState(0)
+ >>> # Positive proposals (tl_x, tl_y, br_x, br_y) for each image
+ >>> pos_proposals_list = [
+ >>> torch.Tensor([
+ >>> [ 7.2425, 5.5929, 13.9414, 14.9541],
+ >>> [ 7.3241, 3.6170, 16.3850, 15.3102],
+ >>> ]),
+ >>> torch.Tensor([
+ >>> [ 4.8448, 6.4010, 7.0314, 9.7681],
+ >>> [ 5.9790, 2.6989, 7.4416, 4.8580],
+ >>> [ 0.0000, 0.0000, 0.1398, 9.8232],
+ >>> ]),
+ >>> ]
+ >>> # Corresponding class index for each proposal for each image
+ >>> pos_assigned_gt_inds_list = [
+ >>> torch.LongTensor([7, 0]),
+ >>> torch.LongTensor([5, 4, 1]),
+ >>> ]
+ >>> # Ground truth mask for each true object for each image
+ >>> gt_masks_list = [
+ >>> BitmapMasks(rng.rand(8, H, W), height=H, width=W),
+ >>> BitmapMasks(rng.rand(6, H, W), height=H, width=W),
+ >>> ]
+ >>> mask_targets = mask_target(
+ >>> pos_proposals_list, pos_assigned_gt_inds_list,
+ >>> gt_masks_list, cfg)
+ >>> assert mask_targets.shape == (5,) + cfg['mask_size']
+ """
+ cfg_list = [cfg for _ in range(len(pos_proposals_list))]
+ mask_targets = map(mask_target_single, pos_proposals_list,
+ pos_assigned_gt_inds_list, gt_masks_list, cfg_list)
+ mask_targets = list(mask_targets)
+ if len(mask_targets) > 0:
+ mask_targets = torch.cat(mask_targets)
+ return mask_targets
+
+
+def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
+ """Compute mask target for each positive proposal in the image.
+
+ Args:
+ pos_proposals (Tensor): Positive proposals.
+ pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals.
+ gt_masks (:obj:`BaseInstanceMasks`): GT masks in the format of Bitmap
+ or Polygon.
+ cfg (dict): Config dict that indicate the mask size.
+
+ Returns:
+ Tensor: Mask target of each positive proposals in the image.
+
+ Example:
+ >>> import mmcv
+ >>> import mmdet
+ >>> from mmcv.core.mask import BitmapMasks
+ >>> from mmcv.core.mask.mask_target import * # NOQA
+ >>> H, W = 32, 32
+ >>> cfg = mmcv.Config({'mask_size': (7, 11)})
+ >>> rng = np.random.RandomState(0)
+ >>> # Masks for each ground truth box (relative to the image)
+ >>> gt_masks_data = rng.rand(3, H, W)
+ >>> gt_masks = BitmapMasks(gt_masks_data, height=H, width=W)
+ >>> # Predicted positive boxes in one image
+ >>> pos_proposals = torch.FloatTensor([
+ >>> [ 16.2, 5.5, 19.9, 20.9],
+ >>> [ 17.3, 13.6, 19.3, 19.3],
+ >>> [ 14.8, 16.4, 17.0, 23.7],
+ >>> [ 0.0, 0.0, 16.0, 16.0],
+ >>> [ 4.0, 0.0, 20.0, 16.0],
+ >>> ])
+ >>> # For each predicted proposal, its assignment to a gt mask
+ >>> pos_assigned_gt_inds = torch.LongTensor([0, 1, 2, 1, 1])
+ >>> mask_targets = mask_target_single(
+ >>> pos_proposals, pos_assigned_gt_inds, gt_masks, cfg)
+ >>> assert mask_targets.shape == (5,) + cfg['mask_size']
+ """
+ device = pos_proposals.device
+ mask_size = _pair(cfg.mask_size)
+ binarize = not cfg.get('soft_mask_target', False)
+ num_pos = pos_proposals.size(0)
+ if num_pos > 0:
+ proposals_np = pos_proposals.cpu().numpy()
+ maxh, maxw = gt_masks.height, gt_masks.width
+ proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw)
+ proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh)
+ pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+
+ mask_targets = gt_masks.crop_and_resize(
+ proposals_np,
+ mask_size,
+ device=device,
+ inds=pos_assigned_gt_inds,
+ binarize=binarize).to_ndarray()
+
+ mask_targets = torch.from_numpy(mask_targets).float().to(device)
+ else:
+ mask_targets = pos_proposals.new_zeros((0, ) + mask_size)
+
+ return mask_targets
diff --git a/mmcv/core/mask/structures.py b/mmcv/core/mask/structures.py
new file mode 100644
index 0000000..10d9155
--- /dev/null
+++ b/mmcv/core/mask/structures.py
@@ -0,0 +1,1037 @@
+from abc import ABCMeta, abstractmethod
+
+import cv2
+import numpy as np
+import pycocotools.mask as maskUtils
+import torch
+from mmcv.ops.roi_align import roi_align
+from mmcv.image import rescale_size, imrescale, imresize, imflip, impad, imtranslate, imshear, imrotate
+
+
+class BaseInstanceMasks(metaclass=ABCMeta):
+ """Base class for instance masks."""
+
+ @abstractmethod
+ def rescale(self, scale, interpolation='nearest'):
+ """Rescale masks as large as possible while keeping the aspect ratio.
+        For details, refer to `mmcv.imrescale`.
+
+ Args:
+ scale (tuple[int]): The maximum size (h, w) of rescaled mask.
+ interpolation (str): Same as :func:`mmcv.imrescale`.
+
+ Returns:
+ BaseInstanceMasks: The rescaled masks.
+ """
+
+ @abstractmethod
+ def resize(self, out_shape, interpolation='nearest'):
+ """Resize masks to the given out_shape.
+
+ Args:
+ out_shape: Target (h, w) of resized mask.
+ interpolation (str): See :func:`mmcv.imresize`.
+
+ Returns:
+ BaseInstanceMasks: The resized masks.
+ """
+
+ @abstractmethod
+ def flip(self, flip_direction='horizontal'):
+ """Flip masks alone the given direction.
+
+ Args:
+ flip_direction (str): Either 'horizontal' or 'vertical'.
+
+ Returns:
+ BaseInstanceMasks: The flipped masks.
+ """
+
+ @abstractmethod
+ def pad(self, out_shape, pad_val):
+ """Pad masks to the given size of (h, w).
+
+ Args:
+ out_shape (tuple[int]): Target (h, w) of padded mask.
+ pad_val (int): The padded value.
+
+ Returns:
+ BaseInstanceMasks: The padded masks.
+ """
+
+ @abstractmethod
+ def crop(self, bbox):
+ """Crop each mask by the given bbox.
+
+ Args:
+ bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ).
+
+ Return:
+ BaseInstanceMasks: The cropped masks.
+ """
+
+ @abstractmethod
+ def crop_and_resize(self,
+ bboxes,
+ out_shape,
+ inds,
+ device,
+ interpolation='bilinear',
+ binarize=True):
+ """Crop and resize masks by the given bboxes.
+
+        This function is mainly used in mask target computation.
+        It first aligns the masks with the bboxes via ``inds``, then crops each
+        mask by its assigned bbox and resizes it to (mask_h, mask_w).
+
+ Args:
+ bboxes (Tensor): Bboxes in format [x1, y1, x2, y2], shape (N, 4)
+ out_shape (tuple[int]): Target (h, w) of resized mask
+ inds (ndarray): Indexes to assign masks to each bbox,
+ shape (N,) and values should be between [0, num_masks - 1].
+ device (str): Device of bboxes
+ interpolation (str): See `mmcv.imresize`
+            binarize (bool): If True, fractional values are rounded to 0 or 1
+                after the resize operation. If False and the mask type does
+                not support soft targets, an error is raised. Defaults to True.
+
+ Return:
+ BaseInstanceMasks: the cropped and resized masks.
+ """
+
+ @abstractmethod
+ def expand(self, expanded_h, expanded_w, top, left):
+ """see :class:`Expand`."""
+
+ @property
+ @abstractmethod
+ def areas(self):
+ """ndarray: areas of each instance."""
+
+ @abstractmethod
+ def to_ndarray(self):
+ """Convert masks to the format of ndarray.
+
+ Return:
+ ndarray: Converted masks in the format of ndarray.
+ """
+
+ @abstractmethod
+ def to_tensor(self, dtype, device):
+ """Convert masks to the format of Tensor.
+
+ Args:
+ dtype (str): Dtype of converted mask.
+ device (torch.device): Device of converted masks.
+
+ Returns:
+ Tensor: Converted masks in the format of Tensor.
+ """
+
+ @abstractmethod
+ def translate(self,
+ out_shape,
+ offset,
+ direction='horizontal',
+ fill_val=0,
+ interpolation='bilinear'):
+ """Translate the masks.
+
+ Args:
+ out_shape (tuple[int]): Shape for output mask, format (h, w).
+ offset (int | float): The offset for translate.
+ direction (str): The translate direction, either "horizontal"
+ or "vertical".
+ fill_val (int | float): Border value. Default 0.
+ interpolation (str): Same as :func:`mmcv.imtranslate`.
+
+ Returns:
+ Translated masks.
+ """
+
+ def shear(self,
+ out_shape,
+ magnitude,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Shear the masks.
+
+ Args:
+ out_shape (tuple[int]): Shape for output mask, format (h, w).
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The shear direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border. Default 0.
+ interpolation (str): Same as in :func:`mmcv.imshear`.
+
+ Returns:
+ ndarray: Sheared masks.
+ """
+
+ @abstractmethod
+ def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0):
+ """Rotate the masks.
+
+ Args:
+ out_shape (tuple[int]): Shape for output mask, format (h, w).
+ angle (int | float): Rotation angle in degrees. Positive values
+ mean counter-clockwise rotation.
+ center (tuple[float], optional): Center point (w, h) of the
+ rotation in source image. If not specified, the center of
+ the image will be used.
+ scale (int | float): Isotropic scale factor.
+ fill_val (int | float): Border value. Default 0 for masks.
+
+ Returns:
+ Rotated masks.
+ """
+
+
+class BitmapMasks(BaseInstanceMasks):
+ """This class represents masks in the form of bitmaps.
+
+ Args:
+ masks (ndarray): ndarray of masks in shape (N, H, W), where N is
+ the number of objects.
+ height (int): height of masks
+ width (int): width of masks
+
+ Example:
+ >>> from mmcv.core.mask.structures import * # NOQA
+ >>> num_masks, H, W = 3, 32, 32
+ >>> rng = np.random.RandomState(0)
+        >>> masks = (rng.rand(num_masks, H, W) > 0.1).astype(np.uint8)
+ >>> self = BitmapMasks(masks, height=H, width=W)
+
+ >>> # demo crop_and_resize
+ >>> num_boxes = 5
+ >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes)
+ >>> out_shape = (14, 14)
+ >>> inds = torch.randint(0, len(self), size=(num_boxes,))
+ >>> device = 'cpu'
+ >>> interpolation = 'bilinear'
+ >>> new = self.crop_and_resize(
+ ... bboxes, out_shape, inds, device, interpolation)
+ >>> assert len(new) == num_boxes
+        >>> assert (new.height, new.width) == out_shape
+ """
+
+ def __init__(self, masks, height, width):
+ self.height = height
+ self.width = width
+ if len(masks) == 0:
+ self.masks = np.empty((0, self.height, self.width), dtype=np.uint8)
+ else:
+ assert isinstance(masks, (list, np.ndarray))
+ if isinstance(masks, list):
+ assert isinstance(masks[0], np.ndarray)
+ assert masks[0].ndim == 2 # (H, W)
+ else:
+ assert masks.ndim == 3 # (N, H, W)
+
+ self.masks = np.stack(masks).reshape(-1, height, width)
+ assert self.masks.shape[1] == self.height
+ assert self.masks.shape[2] == self.width
+
+ def __getitem__(self, index):
+ """Index the BitmapMask.
+
+ Args:
+ index (int | ndarray): Indices in the format of integer or ndarray.
+
+ Returns:
+ :obj:`BitmapMasks`: Indexed bitmap masks.
+ """
+ masks = self.masks[index].reshape(-1, self.height, self.width)
+ return BitmapMasks(masks, self.height, self.width)
+
+ def __iter__(self):
+ return iter(self.masks)
+
+ def __repr__(self):
+ s = self.__class__.__name__ + '('
+ s += f'num_masks={len(self.masks)}, '
+ s += f'height={self.height}, '
+ s += f'width={self.width})'
+ return s
+
+ def __len__(self):
+ """Number of masks."""
+ return len(self.masks)
+
+ def rescale(self, scale, interpolation='nearest'):
+ """See :func:`BaseInstanceMasks.rescale`."""
+ if len(self.masks) == 0:
+ new_w, new_h = rescale_size((self.width, self.height), scale)
+ rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8)
+ else:
+ rescaled_masks = np.stack([
+ imrescale(mask, scale, interpolation=interpolation)
+ for mask in self.masks
+ ])
+ height, width = rescaled_masks.shape[1:]
+ return BitmapMasks(rescaled_masks, height, width)
+
+ def resize(self, out_shape, interpolation='nearest'):
+ """See :func:`BaseInstanceMasks.resize`."""
+ if len(self.masks) == 0:
+ resized_masks = np.empty((0, *out_shape), dtype=np.uint8)
+ else:
+ resized_masks = np.stack([
+ imresize(
+ mask, out_shape[::-1], interpolation=interpolation)
+ for mask in self.masks
+ ])
+ return BitmapMasks(resized_masks, *out_shape)
+
+ def flip(self, flip_direction='horizontal'):
+ """See :func:`BaseInstanceMasks.flip`."""
+ assert flip_direction in ('horizontal', 'vertical', 'diagonal')
+
+ if len(self.masks) == 0:
+ flipped_masks = self.masks
+ else:
+ flipped_masks = np.stack([
+ imflip(mask, direction=flip_direction)
+ for mask in self.masks
+ ])
+ return BitmapMasks(flipped_masks, self.height, self.width)
+
+ def pad(self, out_shape, pad_val=0):
+ """See :func:`BaseInstanceMasks.pad`."""
+ if len(self.masks) == 0:
+ padded_masks = np.empty((0, *out_shape), dtype=np.uint8)
+ else:
+ padded_masks = np.stack([
+ impad(mask, shape=out_shape, pad_val=pad_val)
+ for mask in self.masks
+ ])
+ return BitmapMasks(padded_masks, *out_shape)
+
+ def crop(self, bbox):
+ """See :func:`BaseInstanceMasks.crop`."""
+ assert isinstance(bbox, np.ndarray)
+ assert bbox.ndim == 1
+
+ # clip the boundary
+ bbox = bbox.copy()
+ bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+ bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
+ x1, y1, x2, y2 = bbox
+ w = np.maximum(x2 - x1, 1)
+ h = np.maximum(y2 - y1, 1)
+
+ if len(self.masks) == 0:
+ cropped_masks = np.empty((0, h, w), dtype=np.uint8)
+ else:
+ cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w]
+ return BitmapMasks(cropped_masks, h, w)
+
+ def crop_and_resize(self,
+ bboxes,
+ out_shape,
+ inds,
+ device='cpu',
+ interpolation='bilinear',
+ binarize=True):
+ """See :func:`BaseInstanceMasks.crop_and_resize`."""
+ if len(self.masks) == 0:
+ empty_masks = np.empty((0, *out_shape), dtype=np.uint8)
+ return BitmapMasks(empty_masks, *out_shape)
+
+ # convert bboxes to tensor
+ if isinstance(bboxes, np.ndarray):
+ bboxes = torch.from_numpy(bboxes).to(device=device)
+ if isinstance(inds, np.ndarray):
+ inds = torch.from_numpy(inds).to(device=device)
+
+ num_bbox = bboxes.shape[0]
+ fake_inds = torch.arange(
+ num_bbox, device=device).to(dtype=bboxes.dtype)[:, None]
+ rois = torch.cat([fake_inds, bboxes], dim=1) # Nx5
+ rois = rois.to(device=device)
+ if num_bbox > 0:
+ gt_masks_th = torch.from_numpy(self.masks).to(device).index_select(
+ 0, inds).to(dtype=rois.dtype)
+ targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape,
+ 1.0, 0, 'avg', True).squeeze(1)
+ if binarize:
+ resized_masks = (targets >= 0.5).cpu().numpy()
+ else:
+ resized_masks = targets.cpu().numpy()
+ else:
+ resized_masks = []
+ return BitmapMasks(resized_masks, *out_shape)
+
+ def expand(self, expanded_h, expanded_w, top, left):
+ """See :func:`BaseInstanceMasks.expand`."""
+ if len(self.masks) == 0:
+ expanded_mask = np.empty((0, expanded_h, expanded_w),
+ dtype=np.uint8)
+ else:
+ expanded_mask = np.zeros((len(self), expanded_h, expanded_w),
+ dtype=np.uint8)
+ expanded_mask[:, top:top + self.height,
+ left:left + self.width] = self.masks
+ return BitmapMasks(expanded_mask, expanded_h, expanded_w)
+
+ def translate(self,
+ out_shape,
+ offset,
+ direction='horizontal',
+ fill_val=0,
+ interpolation='bilinear'):
+ """Translate the BitmapMasks.
+
+ Args:
+ out_shape (tuple[int]): Shape for output mask, format (h, w).
+ offset (int | float): The offset for translate.
+ direction (str): The translate direction, either "horizontal"
+ or "vertical".
+ fill_val (int | float): Border value. Default 0 for masks.
+ interpolation (str): Same as :func:`mmcv.imtranslate`.
+
+ Returns:
+ BitmapMasks: Translated BitmapMasks.
+
+ Example:
+ >>> from mmcv.core.mask.structures import BitmapMasks
+ >>> self = BitmapMasks.random(dtype=np.uint8)
+ >>> out_shape = (32, 32)
+ >>> offset = 4
+ >>> direction = 'horizontal'
+ >>> fill_val = 0
+ >>> interpolation = 'bilinear'
+        >>> # Note: there seem to be issues when:
+ >>> # * out_shape is different than self's shape
+ >>> # * the mask dtype is not supported by cv2.AffineWarp
+ >>> new = self.translate(out_shape, offset, direction, fill_val,
+ >>> interpolation)
+ >>> assert len(new) == len(self)
+        >>> assert (new.height, new.width) == out_shape
+ """
+ if len(self.masks) == 0:
+ translated_masks = np.empty((0, *out_shape), dtype=np.uint8)
+ else:
+ translated_masks = imtranslate(
+ self.masks.transpose((1, 2, 0)),
+ offset,
+ direction,
+ border_value=fill_val,
+ interpolation=interpolation)
+ if translated_masks.ndim == 2:
+ translated_masks = translated_masks[:, :, None]
+ translated_masks = translated_masks.transpose(
+ (2, 0, 1)).astype(self.masks.dtype)
+ return BitmapMasks(translated_masks, *out_shape)
+
+ def shear(self,
+ out_shape,
+ magnitude,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Shear the BitmapMasks.
+
+ Args:
+ out_shape (tuple[int]): Shape for output mask, format (h, w).
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The shear direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as in :func:`mmcv.imshear`.
+
+ Returns:
+ BitmapMasks: The sheared masks.
+ """
+ if len(self.masks) == 0:
+ sheared_masks = np.empty((0, *out_shape), dtype=np.uint8)
+ else:
+ sheared_masks = imshear(
+ self.masks.transpose((1, 2, 0)),
+ magnitude,
+ direction,
+ border_value=border_value,
+ interpolation=interpolation)
+ if sheared_masks.ndim == 2:
+ sheared_masks = sheared_masks[:, :, None]
+ sheared_masks = sheared_masks.transpose(
+ (2, 0, 1)).astype(self.masks.dtype)
+ return BitmapMasks(sheared_masks, *out_shape)
+
+ def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0):
+ """Rotate the BitmapMasks.
+
+ Args:
+ out_shape (tuple[int]): Shape for output mask, format (h, w).
+ angle (int | float): Rotation angle in degrees. Positive values
+ mean counter-clockwise rotation.
+ center (tuple[float], optional): Center point (w, h) of the
+ rotation in source image. If not specified, the center of
+ the image will be used.
+ scale (int | float): Isotropic scale factor.
+ fill_val (int | float): Border value. Default 0 for masks.
+
+ Returns:
+ BitmapMasks: Rotated BitmapMasks.
+ """
+ if len(self.masks) == 0:
+ rotated_masks = np.empty((0, *out_shape), dtype=self.masks.dtype)
+ else:
+ rotated_masks = imrotate(
+ self.masks.transpose((1, 2, 0)),
+ angle,
+ center=center,
+ scale=scale,
+ border_value=fill_val)
+ if rotated_masks.ndim == 2:
+ # case when only one mask, (h, w)
+ rotated_masks = rotated_masks[:, :, None] # (h, w, 1)
+ rotated_masks = rotated_masks.transpose(
+ (2, 0, 1)).astype(self.masks.dtype)
+ return BitmapMasks(rotated_masks, *out_shape)
+
+ @property
+ def areas(self):
+ """See :py:attr:`BaseInstanceMasks.areas`."""
+ return self.masks.sum((1, 2))
+
+ def to_ndarray(self):
+ """See :func:`BaseInstanceMasks.to_ndarray`."""
+ return self.masks
+
+ def to_tensor(self, dtype, device):
+ """See :func:`BaseInstanceMasks.to_tensor`."""
+ return torch.tensor(self.masks, dtype=dtype, device=device)
+
+ @classmethod
+ def random(cls,
+ num_masks=3,
+ height=32,
+ width=32,
+ dtype=np.uint8,
+ rng=None):
+ """Generate random bitmap masks for demo / testing purposes.
+
+ Example:
+ >>> from mmcv.core.mask.structures import BitmapMasks
+ >>> self = BitmapMasks.random()
+ >>> print('self = {}'.format(self))
+ self = BitmapMasks(num_masks=3, height=32, width=32)
+ """
+ from mmcv.utils.util_random import ensure_rng
+ rng = ensure_rng(rng)
+ masks = (rng.rand(num_masks, height, width) > 0.1).astype(dtype)
+ self = cls(masks, height=height, width=width)
+ return self
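+
+
+# Illustrative round trip for BitmapMasks (kept as comments so importing this module
+# has no side effects); shapes follow the docstrings above.
+#
+#   >>> import numpy as np
+#   >>> masks = BitmapMasks.random(num_masks=2, height=32, width=32)
+#   >>> small = masks.rescale(0.5)            # -> 2 masks of 16x16
+#   >>> flipped = masks.flip('horizontal')    # same shape, mirrored content
+#   >>> crops = masks.crop_and_resize(
+#   ...     np.array([[0., 0., 16., 16.]]), (7, 7), inds=np.array([0]), device='cpu')
+#   >>> assert crops.to_ndarray().shape == (1, 7, 7)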
+
+
+class PolygonMasks(BaseInstanceMasks):
+ """This class represents masks in the form of polygons.
+
+    The polygons are stored as a three-level list: the first level corresponds
+    to objects, the second level to the polygons that compose each object, and
+    the third level to the polygon coordinates.
+
+ Args:
+ masks (list[list[ndarray]]): The first level of the list
+ corresponds to objects, the second level to the polys that
+ compose the object, the third level to the poly coordinates
+ height (int): height of masks
+ width (int): width of masks
+
+ Example:
+ >>> from mmcv.core.mask.structures import * # NOQA
+ >>> masks = [
+ >>> [ np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0]) ]
+ >>> ]
+ >>> height, width = 16, 16
+ >>> self = PolygonMasks(masks, height, width)
+
+ >>> # demo translate
+ >>> new = self.translate((16, 16), 4., direction='horizontal')
+ >>> assert np.all(new.masks[0][0][1::2] == masks[0][0][1::2])
+ >>> assert np.all(new.masks[0][0][0::2] == masks[0][0][0::2] + 4)
+
+ >>> # demo crop_and_resize
+ >>> num_boxes = 3
+ >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes)
+ >>> out_shape = (16, 16)
+ >>> inds = torch.randint(0, len(self), size=(num_boxes,))
+ >>> device = 'cpu'
+ >>> interpolation = 'bilinear'
+ >>> new = self.crop_and_resize(
+ ... bboxes, out_shape, inds, device, interpolation)
+ >>> assert len(new) == num_boxes
+        >>> assert (new.height, new.width) == out_shape
+ """
+
+ def __init__(self, masks, height, width):
+ assert isinstance(masks, list)
+ if len(masks) > 0:
+ assert isinstance(masks[0], list)
+ assert isinstance(masks[0][0], np.ndarray)
+
+ self.height = height
+ self.width = width
+ self.masks = masks
+
+ def __getitem__(self, index):
+ """Index the polygon masks.
+
+ Args:
+ index (ndarray | List): The indices.
+
+ Returns:
+ :obj:`PolygonMasks`: The indexed polygon masks.
+ """
+ if isinstance(index, np.ndarray):
+ index = index.tolist()
+ if isinstance(index, list):
+ masks = [self.masks[i] for i in index]
+ else:
+ try:
+ masks = self.masks[index]
+ except Exception:
+ raise ValueError(
+ f'Unsupported input of type {type(index)} for indexing!')
+ if len(masks) and isinstance(masks[0], np.ndarray):
+ masks = [masks] # ensure a list of three levels
+ return PolygonMasks(masks, self.height, self.width)
+
+ def __iter__(self):
+ return iter(self.masks)
+
+ def __repr__(self):
+ s = self.__class__.__name__ + '('
+ s += f'num_masks={len(self.masks)}, '
+ s += f'height={self.height}, '
+ s += f'width={self.width})'
+ return s
+
+ def __len__(self):
+ """Number of masks."""
+ return len(self.masks)
+
+ def rescale(self, scale, interpolation=None):
+ """see :func:`BaseInstanceMasks.rescale`"""
+ new_w, new_h = rescale_size((self.width, self.height), scale)
+ if len(self.masks) == 0:
+ rescaled_masks = PolygonMasks([], new_h, new_w)
+ else:
+ rescaled_masks = self.resize((new_h, new_w))
+ return rescaled_masks
+
+ def resize(self, out_shape, interpolation=None):
+ """see :func:`BaseInstanceMasks.resize`"""
+ if len(self.masks) == 0:
+ resized_masks = PolygonMasks([], *out_shape)
+ else:
+ h_scale = out_shape[0] / self.height
+ w_scale = out_shape[1] / self.width
+ resized_masks = []
+ for poly_per_obj in self.masks:
+ resized_poly = []
+ for p in poly_per_obj:
+ p = p.copy()
+ p[0::2] *= w_scale
+ p[1::2] *= h_scale
+ resized_poly.append(p)
+ resized_masks.append(resized_poly)
+ resized_masks = PolygonMasks(resized_masks, *out_shape)
+ return resized_masks
+
+ def flip(self, flip_direction='horizontal'):
+ """see :func:`BaseInstanceMasks.flip`"""
+ assert flip_direction in ('horizontal', 'vertical', 'diagonal')
+ if len(self.masks) == 0:
+ flipped_masks = PolygonMasks([], self.height, self.width)
+ else:
+ flipped_masks = []
+ for poly_per_obj in self.masks:
+ flipped_poly_per_obj = []
+ for p in poly_per_obj:
+ p = p.copy()
+ if flip_direction == 'horizontal':
+ p[0::2] = self.width - p[0::2]
+ elif flip_direction == 'vertical':
+ p[1::2] = self.height - p[1::2]
+ else:
+ p[0::2] = self.width - p[0::2]
+ p[1::2] = self.height - p[1::2]
+ flipped_poly_per_obj.append(p)
+ flipped_masks.append(flipped_poly_per_obj)
+ flipped_masks = PolygonMasks(flipped_masks, self.height,
+ self.width)
+ return flipped_masks
+
+ def crop(self, bbox):
+ """see :func:`BaseInstanceMasks.crop`"""
+ assert isinstance(bbox, np.ndarray)
+ assert bbox.ndim == 1
+
+ # clip the boundary
+ bbox = bbox.copy()
+ bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+ bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
+ x1, y1, x2, y2 = bbox
+ w = np.maximum(x2 - x1, 1)
+ h = np.maximum(y2 - y1, 1)
+
+ if len(self.masks) == 0:
+ cropped_masks = PolygonMasks([], h, w)
+ else:
+ cropped_masks = []
+ for poly_per_obj in self.masks:
+ cropped_poly_per_obj = []
+ for p in poly_per_obj:
+ # pycocotools will clip the boundary
+ p = p.copy()
+ p[0::2] -= bbox[0]
+ p[1::2] -= bbox[1]
+ cropped_poly_per_obj.append(p)
+ cropped_masks.append(cropped_poly_per_obj)
+ cropped_masks = PolygonMasks(cropped_masks, h, w)
+ return cropped_masks
+
+ def pad(self, out_shape, pad_val=0):
+ """padding has no effect on polygons`"""
+ return PolygonMasks(self.masks, *out_shape)
+
+ def expand(self, *args, **kwargs):
+ """TODO: Add expand for polygon"""
+ raise NotImplementedError
+
+ def crop_and_resize(self,
+ bboxes,
+ out_shape,
+ inds,
+ device='cpu',
+ interpolation='bilinear',
+ binarize=True):
+ """see :func:`BaseInstanceMasks.crop_and_resize`"""
+ out_h, out_w = out_shape
+ if len(self.masks) == 0:
+ return PolygonMasks([], out_h, out_w)
+
+ if not binarize:
+ raise ValueError('Polygons are always binary, '
+ 'setting binarize=False is unsupported')
+
+ resized_masks = []
+ for i in range(len(bboxes)):
+ mask = self.masks[inds[i]]
+ bbox = bboxes[i, :]
+ x1, y1, x2, y2 = bbox
+ w = np.maximum(x2 - x1, 1)
+ h = np.maximum(y2 - y1, 1)
+ h_scale = out_h / max(h, 0.1) # avoid too large scale
+ w_scale = out_w / max(w, 0.1)
+
+ resized_mask = []
+ for p in mask:
+ p = p.copy()
+ # crop
+ # pycocotools will clip the boundary
+ p[0::2] -= bbox[0]
+ p[1::2] -= bbox[1]
+
+ # resize
+ p[0::2] *= w_scale
+ p[1::2] *= h_scale
+ resized_mask.append(p)
+ resized_masks.append(resized_mask)
+ return PolygonMasks(resized_masks, *out_shape)
+
+ def translate(self,
+ out_shape,
+ offset,
+ direction='horizontal',
+ fill_val=None,
+ interpolation=None):
+ """Translate the PolygonMasks.
+
+ Example:
+            >>> self = PolygonMasks.random(dtype=np.int64)
+ >>> out_shape = (self.height, self.width)
+ >>> new = self.translate(out_shape, 4., direction='horizontal')
+ >>> assert np.all(new.masks[0][0][1::2] == self.masks[0][0][1::2])
+ >>> assert np.all(new.masks[0][0][0::2] == self.masks[0][0][0::2] + 4) # noqa: E501
+ """
+        assert fill_val is None or fill_val == 0, 'Here fill_val is not ' \
+            f'used, and should be None or 0 by default, got {fill_val}.'
+ if len(self.masks) == 0:
+ translated_masks = PolygonMasks([], *out_shape)
+ else:
+ translated_masks = []
+ for poly_per_obj in self.masks:
+ translated_poly_per_obj = []
+ for p in poly_per_obj:
+ p = p.copy()
+ if direction == 'horizontal':
+ p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1])
+ elif direction == 'vertical':
+ p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0])
+ translated_poly_per_obj.append(p)
+ translated_masks.append(translated_poly_per_obj)
+ translated_masks = PolygonMasks(translated_masks, *out_shape)
+ return translated_masks
+
+ def shear(self,
+ out_shape,
+ magnitude,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """See :func:`BaseInstanceMasks.shear`."""
+ if len(self.masks) == 0:
+ sheared_masks = PolygonMasks([], *out_shape)
+ else:
+ sheared_masks = []
+ if direction == 'horizontal':
+ shear_matrix = np.stack([[1, magnitude],
+ [0, 1]]).astype(np.float32)
+ elif direction == 'vertical':
+ shear_matrix = np.stack([[1, 0], [magnitude,
+ 1]]).astype(np.float32)
+ for poly_per_obj in self.masks:
+ sheared_poly = []
+ for p in poly_per_obj:
+ p = np.stack([p[0::2], p[1::2]], axis=0) # [2, n]
+ new_coords = np.matmul(shear_matrix, p) # [2, n]
+ new_coords[0, :] = np.clip(new_coords[0, :], 0,
+ out_shape[1])
+ new_coords[1, :] = np.clip(new_coords[1, :], 0,
+ out_shape[0])
+ sheared_poly.append(
+ new_coords.transpose((1, 0)).reshape(-1))
+ sheared_masks.append(sheared_poly)
+ sheared_masks = PolygonMasks(sheared_masks, *out_shape)
+ return sheared_masks
+
+ def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0):
+ """See :func:`BaseInstanceMasks.rotate`."""
+ if len(self.masks) == 0:
+ rotated_masks = PolygonMasks([], *out_shape)
+ else:
+ rotated_masks = []
+ rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+ for poly_per_obj in self.masks:
+ rotated_poly = []
+ for p in poly_per_obj:
+ p = p.copy()
+ coords = np.stack([p[0::2], p[1::2]], axis=1) # [n, 2]
+ # pad 1 to convert from format [x, y] to homogeneous
+ # coordinates format [x, y, 1]
+ coords = np.concatenate(
+ (coords, np.ones((coords.shape[0], 1), coords.dtype)),
+ axis=1) # [n, 3]
+ rotated_coords = np.matmul(
+ rotate_matrix[None, :, :],
+ coords[:, :, None])[..., 0] # [n, 2, 1] -> [n, 2]
+ rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0,
+ out_shape[1])
+ rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0,
+ out_shape[0])
+ rotated_poly.append(rotated_coords.reshape(-1))
+ rotated_masks.append(rotated_poly)
+ rotated_masks = PolygonMasks(rotated_masks, *out_shape)
+ return rotated_masks
+
+ def to_bitmap(self):
+ """convert polygon masks to bitmap masks."""
+ bitmap_masks = self.to_ndarray()
+ return BitmapMasks(bitmap_masks, self.height, self.width)
+
+ @property
+ def areas(self):
+ """Compute areas of masks.
+
+        This function is adapted from detectron2 and only works with
+        polygons, using the shoelace formula.
+
+ Return:
+ ndarray: areas of each instance
+ """ # noqa: W501
+ area = []
+ for polygons_per_obj in self.masks:
+ area_per_obj = 0
+ for p in polygons_per_obj:
+ area_per_obj += self._polygon_area(p[0::2], p[1::2])
+ area.append(area_per_obj)
+ return np.asarray(area)
+
+ def _polygon_area(self, x, y):
+ """Compute the area of a component of a polygon.
+
+ Using the shoelace formula:
+ https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+
+ Args:
+ x (ndarray): x coordinates of the component
+ y (ndarray): y coordinates of the component
+
+ Return:
+            float: the area of the component
+ """ # noqa: 501
+ return 0.5 * np.abs(
+ np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
+
+ def to_ndarray(self):
+ """Convert masks to the format of ndarray."""
+ if len(self.masks) == 0:
+ return np.empty((0, self.height, self.width), dtype=np.uint8)
+ bitmap_masks = []
+ for poly_per_obj in self.masks:
+ bitmap_masks.append(
+ polygon_to_bitmap(poly_per_obj, self.height, self.width))
+ return np.stack(bitmap_masks)
+
+ def to_tensor(self, dtype, device):
+ """See :func:`BaseInstanceMasks.to_tensor`."""
+ if len(self.masks) == 0:
+ return torch.empty((0, self.height, self.width),
+ dtype=dtype,
+ device=device)
+ ndarray_masks = self.to_ndarray()
+ return torch.tensor(ndarray_masks, dtype=dtype, device=device)
+
+ @classmethod
+ def random(cls,
+ num_masks=3,
+ height=32,
+ width=32,
+ n_verts=5,
+ dtype=np.float32,
+ rng=None):
+ """Generate random polygon masks for demo / testing purposes.
+
+ Adapted from [1]_
+
+ References:
+ .. [1] https://gitlab.kitware.com/computer-vision/kwimage/-/blob/928cae35ca8/kwimage/structs/polygon.py#L379 # noqa: E501
+
+ Example:
+ >>> from mmcv.core.mask.structures import PolygonMasks
+ >>> self = PolygonMasks.random()
+ >>> print('self = {}'.format(self))
+ """
+ from mmcv.utils.util_random import ensure_rng
+ rng = ensure_rng(rng)
+
+ def _gen_polygon(n, irregularity, spikeyness):
+ """Creates the polygon by sampling points on a circle around the
+ centre. Random noise is added by varying the angular spacing
+ between sequential points, and by varying the radial distance of
+ each point from the centre.
+
+ Based on original code by Mike Ounsworth
+
+ Args:
+ n (int): number of vertices
+ irregularity (float): [0,1] indicating how much variance there
+ is in the angular spacing of vertices. [0,1] will map to
+ [0, 2pi/numberOfVerts]
+ spikeyness (float): [0,1] indicating how much variance there is
+ in each vertex from the circle of radius aveRadius. [0,1]
+ will map to [0, aveRadius]
+
+ Returns:
+ a list of vertices, in CCW order.
+ """
+ from scipy.stats import truncnorm
+ # Generate around the unit circle
+ cx, cy = (0.0, 0.0)
+ radius = 1
+
+ tau = np.pi * 2
+
+ irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / n
+ spikeyness = np.clip(spikeyness, 1e-9, 1)
+
+ # generate n angle steps
+ lower = (tau / n) - irregularity
+ upper = (tau / n) + irregularity
+ angle_steps = rng.uniform(lower, upper, n)
+
+ # normalize the steps so that point 0 and point n+1 are the same
+ k = angle_steps.sum() / (2 * np.pi)
+ angles = (angle_steps / k).cumsum() + rng.uniform(0, tau)
+
+ # Convert high and low values to be wrt the standard normal range
+ # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html
+ low = 0
+ high = 2 * radius
+ mean = radius
+ std = spikeyness
+ a = (low - mean) / std
+ b = (high - mean) / std
+ tnorm = truncnorm(a=a, b=b, loc=mean, scale=std)
+
+ # now generate the points
+ radii = tnorm.rvs(n, random_state=rng)
+ x_pts = cx + radii * np.cos(angles)
+ y_pts = cy + radii * np.sin(angles)
+
+ points = np.hstack([x_pts[:, None], y_pts[:, None]])
+
+ # Scale to 0-1 space
+ points = points - points.min(axis=0)
+ points = points / points.max(axis=0)
+
+ # Randomly place within 0-1 space
+ points = points * (rng.rand() * .8 + .2)
+ min_pt = points.min(axis=0)
+ max_pt = points.max(axis=0)
+
+ high = (1 - max_pt)
+ low = (0 - min_pt)
+ offset = (rng.rand(2) * (high - low)) + low
+ points = points + offset
+ return points
+
+ def _order_vertices(verts):
+ """
+ References:
+ https://stackoverflow.com/questions/1709283/how-can-i-sort-a-coordinate-list-for-a-rectangle-counterclockwise
+ """
+ mlat = verts.T[0].sum() / len(verts)
+ mlng = verts.T[1].sum() / len(verts)
+
+ tau = np.pi * 2
+ angle = (np.arctan2(mlat - verts.T[0], verts.T[1] - mlng) +
+ tau) % tau
+ sortx = angle.argsort()
+ verts = verts.take(sortx, axis=0)
+ return verts
+
+ # Generate a random exterior for each requested mask
+ masks = []
+ for _ in range(num_masks):
+ exterior = _order_vertices(_gen_polygon(n_verts, 0.9, 0.9))
+ exterior = (exterior * [(width, height)]).astype(dtype)
+ masks.append([exterior.ravel()])
+
+ self = cls(masks, height, width)
+ return self
+
+
+def polygon_to_bitmap(polygons, height, width):
+ """Convert masks from the form of polygons to bitmaps.
+
+ Args:
+ polygons (list[ndarray]): masks in polygon representation
+ height (int): mask height
+ width (int): mask width
+
+ Return:
+ ndarray: the converted masks in bitmap representation
+ """
+ rles = maskUtils.frPyObjects(polygons, height, width)
+ rle = maskUtils.merge(rles)
+    bitmap_mask = maskUtils.decode(rle).astype(bool)
+ return bitmap_mask
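+
+
+# Sketch of the polygon-to-bitmap path (illustrative comments only; the exact pixel
+# values depend on the random polygons).
+#
+#   >>> polys = PolygonMasks.random(num_masks=2, height=32, width=32)
+#   >>> bitmaps = polys.to_bitmap()              # rasterised via polygon_to_bitmap
+#   >>> assert isinstance(bitmaps, BitmapMasks)
+#   >>> assert bitmaps.to_ndarray().shape == (2, 32, 32)
+#   >>> areas = polys.areas                      # shoelace area per instance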
diff --git a/mmcv/core/mask/utils.py b/mmcv/core/mask/utils.py
new file mode 100644
index 0000000..cc671b1
--- /dev/null
+++ b/mmcv/core/mask/utils.py
@@ -0,0 +1,63 @@
+from mmcv.utils import slice_list
+import numpy as np
+import pycocotools.mask as mask_util
+
+
+def split_combined_polys(polys, poly_lens, polys_per_mask):
+ """Split the combined 1-D polys into masks.
+
+ A mask is represented as a list of polys, and a poly is represented as
+ a 1-D array. In dataset, all masks are concatenated into a single 1-D
+ tensor. Here we need to split the tensor into original representations.
+
+ Args:
+ polys (list): a list (length = image num) of 1-D tensors
+ poly_lens (list): a list (length = image num) of poly length
+ polys_per_mask (list): a list (length = image num) of poly number
+ of each mask
+
+ Returns:
+ list: a list (length = image num) of list (length = mask num) of \
+ list (length = poly num) of numpy array.
+ """
+ mask_polys_list = []
+ for img_id in range(len(polys)):
+ polys_single = polys[img_id]
+ polys_lens_single = poly_lens[img_id].tolist()
+ polys_per_mask_single = polys_per_mask[img_id].tolist()
+
+ split_polys = slice_list(polys_single, polys_lens_single)
+ mask_polys = slice_list(split_polys, polys_per_mask_single)
+ mask_polys_list.append(mask_polys)
+ return mask_polys_list
+
+
+# TODO: move this function to more proper place
+def encode_mask_results(mask_results):
+ """Encode bitmap mask to RLE code.
+
+ Args:
+ mask_results (list | tuple[list]): bitmap mask results.
+ In mask scoring rcnn, mask_results is a tuple of (segm_results,
+ segm_cls_score).
+
+ Returns:
+ list | tuple: RLE encoded mask.
+ """
+ if isinstance(mask_results, tuple): # mask scoring
+ cls_segms, cls_mask_scores = mask_results
+ else:
+ cls_segms = mask_results
+ num_classes = len(cls_segms)
+ encoded_mask_results = [[] for _ in range(num_classes)]
+ for i in range(len(cls_segms)):
+ for cls_segm in cls_segms[i]:
+ encoded_mask_results[i].append(
+ mask_util.encode(
+ np.array(
+ cls_segm[:, :, np.newaxis], order='F',
+ dtype='uint8'))[0]) # encoded with RLE
+ if isinstance(mask_results, tuple):
+ return encoded_mask_results, cls_mask_scores
+ else:
+ return encoded_mask_results
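+
+
+# Minimal sketch of encode_mask_results on fabricated per-class results (binary HxW
+# arrays); the class count and shapes here are arbitrary.
+#
+#   >>> import numpy as np
+#   >>> masks_cls0 = [np.zeros((32, 32), dtype=bool), np.ones((32, 32), dtype=bool)]
+#   >>> masks_cls1 = []                              # no detections for this class
+#   >>> encoded = encode_mask_results([masks_cls0, masks_cls1])
+#   >>> len(encoded), len(encoded[0]), len(encoded[1])
+#   (2, 2, 0)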
diff --git a/mmcv/core/points/__init__.py b/mmcv/core/points/__init__.py
new file mode 100644
index 0000000..73d2d83
--- /dev/null
+++ b/mmcv/core/points/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_points import BasePoints
+from .cam_points import CameraPoints
+from .depth_points import DepthPoints
+from .lidar_points import LiDARPoints
+
+__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints']
+
+
+def get_points_type(points_type):
+ """Get the class of points according to coordinate type.
+
+ Args:
+ points_type (str): The type of points coordinate.
+            The valid values are "CAMERA", "LIDAR", or "DEPTH".
+
+ Returns:
+ class: Points type.
+ """
+ if points_type == 'CAMERA':
+ points_cls = CameraPoints
+ elif points_type == 'LIDAR':
+ points_cls = LiDARPoints
+ elif points_type == 'DEPTH':
+ points_cls = DepthPoints
+ else:
+ raise ValueError('Only "points_type" of "CAMERA", "LIDAR", or "DEPTH"'
+ f' are supported, got {points_type}')
+
+ return points_cls
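+
+
+# Usage sketch: resolve a point class by coordinate-system name and wrap raw xyz data.
+#
+#   >>> import torch
+#   >>> points_cls = get_points_type('LIDAR')        # -> LiDARPoints
+#   >>> pts = points_cls(torch.rand(100, 3))
+#   >>> len(pts), pts.points_dim
+#   (100, 3)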
diff --git a/mmcv/core/points/base_points.py b/mmcv/core/points/base_points.py
new file mode 100644
index 0000000..31b8cec
--- /dev/null
+++ b/mmcv/core/points/base_points.py
@@ -0,0 +1,436 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import warnings
+from abc import abstractmethod
+
+
+class BasePoints(object):
+ """Base class for Points.
+
+ Args:
+ tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
+ points_dim (int): Number of the dimension of a point.
+ Each row is (x, y, z). Default to 3.
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
+ dimension. Default to None.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x points_dim.
+ points_dim (int): Integer indicating the dimension of a point.
+ Each row is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Default to None.
+ rotation_axis (int): Default rotation axis for points rotation.
+ """
+
+ def __init__(self, tensor, points_dim=3, attribute_dims=None):
+ if isinstance(tensor, torch.Tensor):
+ device = tensor.device
+ else:
+ device = torch.device('cpu')
+ tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+ if tensor.numel() == 0:
+ # Use reshape, so we don't end up creating a new tensor that
+ # does not depend on the inputs (and consequently confuses jit)
+ tensor = tensor.reshape((0, points_dim)).to(
+ dtype=torch.float32, device=device)
+ assert tensor.dim() == 2 and tensor.size(-1) == \
+ points_dim, tensor.size()
+
+ self.tensor = tensor
+ self.points_dim = points_dim
+ self.attribute_dims = attribute_dims
+ self.rotation_axis = 0
+
+ @property
+ def coord(self):
+ """torch.Tensor: Coordinates of each point with size (N, 3)."""
+ return self.tensor[:, :3]
+
+ @coord.setter
+ def coord(self, tensor):
+ """Set the coordinates of each point."""
+ try:
+ tensor = tensor.reshape(self.shape[0], 3)
+ except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray
+ raise ValueError(f'got unexpected shape {tensor.shape}')
+ if not isinstance(tensor, torch.Tensor):
+ tensor = self.tensor.new_tensor(tensor)
+ self.tensor[:, :3] = tensor
+
+ @property
+ def height(self):
+ """torch.Tensor: A vector with height of each point."""
+ if self.attribute_dims is not None and \
+ 'height' in self.attribute_dims.keys():
+ return self.tensor[:, self.attribute_dims['height']]
+ else:
+ return None
+
+ @height.setter
+ def height(self, tensor):
+ """Set the height of each point."""
+ try:
+ tensor = tensor.reshape(self.shape[0])
+ except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray
+ raise ValueError(f'got unexpected shape {tensor.shape}')
+ if not isinstance(tensor, torch.Tensor):
+ tensor = self.tensor.new_tensor(tensor)
+ if self.attribute_dims is not None and \
+ 'height' in self.attribute_dims.keys():
+ self.tensor[:, self.attribute_dims['height']] = tensor
+ else:
+ # add height attribute
+ if self.attribute_dims is None:
+ self.attribute_dims = dict()
+ attr_dim = self.shape[1]
+ self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1)
+ self.attribute_dims.update(dict(height=attr_dim))
+ self.points_dim += 1
+
+ @property
+ def color(self):
+ """torch.Tensor: A vector with color of each point."""
+ if self.attribute_dims is not None and \
+ 'color' in self.attribute_dims.keys():
+ return self.tensor[:, self.attribute_dims['color']]
+ else:
+ return None
+
+ @color.setter
+ def color(self, tensor):
+ """Set the color of each point."""
+ try:
+ tensor = tensor.reshape(self.shape[0], 3)
+ except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray
+ raise ValueError(f'got unexpected shape {tensor.shape}')
+ if tensor.max() >= 256 or tensor.min() < 0:
+ warnings.warn('point got color value beyond [0, 255]')
+ if not isinstance(tensor, torch.Tensor):
+ tensor = self.tensor.new_tensor(tensor)
+ if self.attribute_dims is not None and \
+ 'color' in self.attribute_dims.keys():
+ self.tensor[:, self.attribute_dims['color']] = tensor
+ else:
+ # add color attribute
+ if self.attribute_dims is None:
+ self.attribute_dims = dict()
+ attr_dim = self.shape[1]
+ self.tensor = torch.cat([self.tensor, tensor], dim=1)
+ self.attribute_dims.update(
+ dict(color=[attr_dim, attr_dim + 1, attr_dim + 2]))
+ self.points_dim += 3
+
+ @property
+ def shape(self):
+ """torch.Shape: Shape of points."""
+ return self.tensor.shape
+
+ def shuffle(self):
+ """Shuffle the points.
+
+ Returns:
+ torch.Tensor: The shuffled index.
+ """
+ idx = torch.randperm(self.__len__(), device=self.tensor.device)
+ self.tensor = self.tensor[idx]
+ return idx
+
+ def rotate(self, rotation, axis=None):
+ """Rotate points with the given rotation matrix or angle.
+
+ Args:
+ rotation (float, np.ndarray, torch.Tensor): Rotation matrix
+ or angle.
+ axis (int): Axis to rotate at. Defaults to None.
+ """
+ if not isinstance(rotation, torch.Tensor):
+ rotation = self.tensor.new_tensor(rotation)
+ assert rotation.shape == torch.Size([3, 3]) or \
+ rotation.numel() == 1, f'invalid rotation shape {rotation.shape}'
+
+ if axis is None:
+ axis = self.rotation_axis
+
+ if rotation.numel() == 1:
+ rot_sin = torch.sin(rotation)
+ rot_cos = torch.cos(rotation)
+ if axis == 1:
+ rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin],
+ [0, 1, 0],
+ [rot_sin, 0, rot_cos]])
+ elif axis == 2 or axis == -1:
+ rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0],
+ [rot_sin, rot_cos, 0],
+ [0, 0, 1]])
+ elif axis == 0:
+ rot_mat_T = rotation.new_tensor([[0, rot_cos, -rot_sin],
+ [0, rot_sin, rot_cos],
+ [1, 0, 0]])
+ else:
+                raise ValueError(f'axis should be 0, 1, 2 or -1, got {axis}')
+ rot_mat_T = rot_mat_T.T
+ elif rotation.numel() == 9:
+ rot_mat_T = rotation
+ else:
+ raise NotImplementedError
+ self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
+
+ return rot_mat_T
+
+ @abstractmethod
+ def flip(self, bev_direction='horizontal'):
+ """Flip the points in BEV along given BEV direction."""
+ pass
+
+ def translate(self, trans_vector):
+ """Translate points with the given translation vector.
+
+ Args:
+ trans_vector (np.ndarray, torch.Tensor): Translation
+ vector of size 3 or nx3.
+ """
+ if not isinstance(trans_vector, torch.Tensor):
+ trans_vector = self.tensor.new_tensor(trans_vector)
+ trans_vector = trans_vector.squeeze(0)
+ if trans_vector.dim() == 1:
+ assert trans_vector.shape[0] == 3
+ elif trans_vector.dim() == 2:
+ assert trans_vector.shape[0] == self.tensor.shape[0] and \
+ trans_vector.shape[1] == 3
+ else:
+ raise NotImplementedError(
+ f'Unsupported translation vector of shape {trans_vector.shape}'
+ )
+ self.tensor[:, :3] += trans_vector
+
+ def in_range_3d(self, point_range):
+ """Check whether the points are in the given range.
+
+ Args:
+ point_range (list | torch.Tensor): The range of point
+ (x_min, y_min, z_min, x_max, y_max, z_max)
+
+ Note:
+            In the original implementation of SECOND, checking whether a box
+            is in the range requires testing whether its points lie inside a
+            convex polygon; here we use a simpler axis-aligned range check.
+
+ Returns:
+ torch.Tensor: A binary vector indicating whether each point is \
+ inside the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > point_range[0])
+ & (self.tensor[:, 1] > point_range[1])
+ & (self.tensor[:, 2] > point_range[2])
+ & (self.tensor[:, 0] < point_range[3])
+ & (self.tensor[:, 1] < point_range[4])
+ & (self.tensor[:, 2] < point_range[5]))
+ return in_range_flags
+
+ @abstractmethod
+ def in_range_bev(self, point_range):
+ """Check whether the points are in the given range.
+
+ Args:
+ point_range (list | torch.Tensor): The range of point
+ in order of (x_min, y_min, x_max, y_max).
+
+ Returns:
+ torch.Tensor: Indicating whether each point is inside \
+ the reference range.
+ """
+ pass
+
+ @abstractmethod
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`CoordMode`): The target Box mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+            :obj:`BasePoints`: The converted points of the same type \
+ in the `dst` mode.
+ """
+ pass
+
+ def scale(self, scale_factor):
+ """Scale the points with horizontal and vertical scaling factors.
+
+ Args:
+            scale_factor (float): Scale factor to scale the points with.
+ """
+ self.tensor[:, :3] *= scale_factor
+
+ def __getitem__(self, item):
+ """
+ Note:
+ The following usage are allowed:
+ 1. `new_points = points[3]`:
+ return a `Points` that contains only one point.
+ 2. `new_points = points[2:10]`:
+ return a slice of points.
+ 3. `new_points = points[vector]`:
+ where vector is a torch.BoolTensor with `length = len(points)`.
+ Nonzero elements in the vector will be selected.
+ 4. `new_points = points[3:11, vector]`:
+ return a slice of points and attribute dims.
+ 5. `new_points = points[4:12, 2]`:
+ return a slice of points with single attribute.
+ Note that the returned Points might share storage with this Points,
+ subject to Pytorch's indexing semantics.
+
+ Returns:
+ :obj:`BasePoints`: A new object of \
+ :class:`BasePoints` after indexing.
+ """
+ original_type = type(self)
+ if isinstance(item, int):
+ return original_type(
+ self.tensor[item].view(1, -1),
+ points_dim=self.points_dim,
+ attribute_dims=self.attribute_dims)
+ elif isinstance(item, tuple) and len(item) == 2:
+ if isinstance(item[1], slice):
+ start = 0 if item[1].start is None else item[1].start
+ stop = self.tensor.shape[1] if \
+ item[1].stop is None else item[1].stop
+ step = 1 if item[1].step is None else item[1].step
+ item = list(item)
+ item[1] = list(range(start, stop, step))
+ item = tuple(item)
+ elif isinstance(item[1], int):
+ item = list(item)
+ item[1] = [item[1]]
+ item = tuple(item)
+ p = self.tensor[item[0], item[1]]
+
+ keep_dims = list(
+ set(item[1]).intersection(set(range(3, self.tensor.shape[1]))))
+ if self.attribute_dims is not None:
+ attribute_dims = self.attribute_dims.copy()
+ for key in self.attribute_dims.keys():
+ cur_attribute_dims = attribute_dims[key]
+ if isinstance(cur_attribute_dims, int):
+ cur_attribute_dims = [cur_attribute_dims]
+ intersect_attr = list(
+ set(cur_attribute_dims).intersection(set(keep_dims)))
+ if len(intersect_attr) == 1:
+ attribute_dims[key] = intersect_attr[0]
+ elif len(intersect_attr) > 1:
+ attribute_dims[key] = intersect_attr
+ else:
+ attribute_dims.pop(key)
+ else:
+ attribute_dims = None
+ elif isinstance(item, (slice, np.ndarray, torch.Tensor)):
+ p = self.tensor[item]
+ attribute_dims = self.attribute_dims
+ else:
+ raise NotImplementedError(f'Invalid slice {item}!')
+
+ assert p.dim() == 2, \
+ f'Indexing on Points with {item} failed to return a matrix!'
+ return original_type(
+ p, points_dim=p.shape[1], attribute_dims=attribute_dims)
+
+ def __len__(self):
+ """int: Number of points in the current object."""
+ return self.tensor.shape[0]
+
+ def __repr__(self):
+ """str: Return a strings that describes the object."""
+ return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')'
+
+ @classmethod
+ def cat(cls, points_list):
+ """Concatenate a list of Points into a single Points.
+
+ Args:
+ points_list (list[:obj:`BasePoints`]): List of points.
+
+ Returns:
+ :obj:`BasePoints`: The concatenated Points.
+ """
+ assert isinstance(points_list, (list, tuple))
+ if len(points_list) == 0:
+ return cls(torch.empty(0))
+ assert all(isinstance(points, cls) for points in points_list)
+
+ # use torch.cat (v.s. layers.cat)
+ # so the returned points never share storage with input
+ cat_points = cls(
+ torch.cat([p.tensor for p in points_list], dim=0),
+ points_dim=points_list[0].tensor.shape[1],
+ attribute_dims=points_list[0].attribute_dims)
+ return cat_points
+
+ def to(self, device):
+ """Convert current points to a specific device.
+
+ Args:
+ device (str | :obj:`torch.device`): The name of the device.
+
+ Returns:
+            :obj:`BasePoints`: A new points object on the \
+ specific device.
+ """
+ original_type = type(self)
+ return original_type(
+ self.tensor.to(device),
+ points_dim=self.points_dim,
+ attribute_dims=self.attribute_dims)
+
+ def clone(self):
+ """Clone the Points.
+
+ Returns:
+            :obj:`BasePoints`: Point object with the same properties \
+ as self.
+ """
+ original_type = type(self)
+ return original_type(
+ self.tensor.clone(),
+ points_dim=self.points_dim,
+ attribute_dims=self.attribute_dims)
+
+ @property
+ def device(self):
+ """str: The device of the points are on."""
+ return self.tensor.device
+
+ def __iter__(self):
+ """Yield a point as a Tensor of shape (4,) at a time.
+
+ Returns:
+ torch.Tensor: A point of shape (4,).
+ """
+ yield from self.tensor
+
+ def new_point(self, data):
+ """Create a new point object with data.
+
+        The new point and its tensor have similar properties \
+            to self and self.tensor, respectively.
+
+ Args:
+ data (torch.Tensor | numpy.array | list): Data to be copied.
+
+ Returns:
+ :obj:`BasePoints`: A new point object with ``data``, \
+ the object's other properties are similar to ``self``.
+ """
+ new_tensor = self.tensor.new_tensor(data) \
+ if not isinstance(data, torch.Tensor) else data.to(self.device)
+ original_type = type(self)
+ return original_type(
+ new_tensor,
+ points_dim=self.points_dim,
+ attribute_dims=self.attribute_dims)
diff --git a/mmcv/core/points/cam_points.py b/mmcv/core/points/cam_points.py
new file mode 100644
index 0000000..ba83cf0
--- /dev/null
+++ b/mmcv/core/points/cam_points.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_points import BasePoints
+
+
+class CameraPoints(BasePoints):
+ """Points of instances in CAM coordinates.
+
+ Args:
+ tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
+ points_dim (int): Number of the dimension of a point.
+ Each row is (x, y, z). Default to 3.
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
+ dimension. Default to None.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x points_dim.
+ points_dim (int): Integer indicating the dimension of a point.
+ Each row is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Default to None.
+ rotation_axis (int): Default rotation axis for points rotation.
+ """
+
+ def __init__(self, tensor, points_dim=3, attribute_dims=None):
+ super(CameraPoints, self).__init__(
+ tensor, points_dim=points_dim, attribute_dims=attribute_dims)
+ self.rotation_axis = 1
+
+ def flip(self, bev_direction='horizontal'):
+ """Flip the boxes in BEV along given BEV direction."""
+ if bev_direction == 'horizontal':
+ self.tensor[:, 0] = -self.tensor[:, 0]
+ elif bev_direction == 'vertical':
+ self.tensor[:, 2] = -self.tensor[:, 2]
+
+ def in_range_bev(self, point_range):
+ """Check whether the points are in the given range.
+
+ Args:
+ point_range (list | torch.Tensor): The range of point
+ in order of (x_min, y_min, x_max, y_max).
+
+ Returns:
+ torch.Tensor: Indicating whether each point is inside \
+ the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > point_range[0])
+ & (self.tensor[:, 2] > point_range[1])
+ & (self.tensor[:, 0] < point_range[2])
+ & (self.tensor[:, 2] < point_range[3]))
+ return in_range_flags
+
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`CoordMode`): The target Point mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`BasePoints`: The converted point of the same type \
+ in the `dst` mode.
+ """
+ from mmcv.core.bbox import Coord3DMode
+ return Coord3DMode.convert_point(
+ point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat)
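+
+
+# Illustrative conversion between coordinate systems (a sketch; it assumes
+# Coord3DMode is importable from mmcv.core.bbox, as in convert_to above).
+#
+#   >>> import torch
+#   >>> from mmcv.core.bbox import Coord3DMode
+#   >>> cam_pts = CameraPoints(torch.rand(10, 3))
+#   >>> lidar_pts = cam_pts.convert_to(Coord3DMode.LIDAR)
+#   >>> lidar_pts.shape
+#   torch.Size([10, 3])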
diff --git a/mmcv/core/points/depth_points.py b/mmcv/core/points/depth_points.py
new file mode 100644
index 0000000..1b12299
--- /dev/null
+++ b/mmcv/core/points/depth_points.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_points import BasePoints
+
+
+class DepthPoints(BasePoints):
+ """Points of instances in DEPTH coordinates.
+
+ Args:
+ tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
+ points_dim (int): Number of the dimension of a point.
+ Each row is (x, y, z). Default to 3.
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
+ dimension. Default to None.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x points_dim.
+ points_dim (int): Integer indicating the dimension of a point.
+ Each row is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Default to None.
+ rotation_axis (int): Default rotation axis for points rotation.
+ """
+
+ def __init__(self, tensor, points_dim=3, attribute_dims=None):
+ super(DepthPoints, self).__init__(
+ tensor, points_dim=points_dim, attribute_dims=attribute_dims)
+ self.rotation_axis = 2
+
+ def flip(self, bev_direction='horizontal'):
+ """Flip the boxes in BEV along given BEV direction."""
+ if bev_direction == 'horizontal':
+ self.tensor[:, 0] = -self.tensor[:, 0]
+ elif bev_direction == 'vertical':
+ self.tensor[:, 1] = -self.tensor[:, 1]
+
+ def in_range_bev(self, point_range):
+ """Check whether the points are in the given range.
+
+ Args:
+ point_range (list | torch.Tensor): The range of point
+ in order of (x_min, y_min, x_max, y_max).
+
+ Returns:
+ torch.Tensor: Indicating whether each point is inside \
+ the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > point_range[0])
+ & (self.tensor[:, 1] > point_range[1])
+ & (self.tensor[:, 0] < point_range[2])
+ & (self.tensor[:, 1] < point_range[3]))
+ return in_range_flags
+
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`CoordMode`): The target Point mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`BasePoints`: The converted point of the same type \
+ in the `dst` mode.
+ """
+ from mmcv.core.bbox import Coord3DMode
+ return Coord3DMode.convert_point(
+ point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
diff --git a/mmcv/core/points/lidar_points.py b/mmcv/core/points/lidar_points.py
new file mode 100644
index 0000000..bbfddd9
--- /dev/null
+++ b/mmcv/core/points/lidar_points.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_points import BasePoints
+
+
+class LiDARPoints(BasePoints):
+ """Points of instances in LIDAR coordinates.
+
+ Args:
+ tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
+ points_dim (int): Number of the dimension of a point.
+ Each row is (x, y, z). Default to 3.
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
+ dimension. Default to None.
+
+ Attributes:
+ tensor (torch.Tensor): Float matrix of N x points_dim.
+ points_dim (int): Integer indicating the dimension of a point.
+ Each row is (x, y, z, ...).
+        attribute_dims (dict, optional): Dictionary to indicate the meaning of
+            extra dimension. Default to None.
+ rotation_axis (int): Default rotation axis for points rotation.
+ """
+
+ def __init__(self, tensor, points_dim=3, attribute_dims=None):
+ super(LiDARPoints, self).__init__(
+ tensor, points_dim=points_dim, attribute_dims=attribute_dims)
+ self.rotation_axis = 2
+
+ def flip(self, bev_direction='horizontal'):
+ """Flip the boxes in BEV along given BEV direction."""
+ if bev_direction == 'horizontal':
+ self.tensor[:, 1] = -self.tensor[:, 1]
+ elif bev_direction == 'vertical':
+ self.tensor[:, 0] = -self.tensor[:, 0]
+
+ def in_range_bev(self, point_range):
+ """Check whether the points are in the given range.
+
+ Args:
+ point_range (list | torch.Tensor): The range of point
+ in order of (x_min, y_min, x_max, y_max).
+
+ Returns:
+ torch.Tensor: Indicating whether each point is inside \
+ the reference range.
+ """
+ in_range_flags = ((self.tensor[:, 0] > point_range[0])
+ & (self.tensor[:, 1] > point_range[1])
+ & (self.tensor[:, 0] < point_range[2])
+ & (self.tensor[:, 1] < point_range[3]))
+ return in_range_flags
+
+ def convert_to(self, dst, rt_mat=None):
+ """Convert self to ``dst`` mode.
+
+ Args:
+ dst (:obj:`CoordMode`): The target Point mode.
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+ matrix between different coordinates. Defaults to None.
+ The conversion from `src` coordinates to `dst` coordinates
+ usually comes along the change of sensors, e.g., from camera
+ to LiDAR. This requires a transformation matrix.
+
+ Returns:
+ :obj:`BasePoints`: The converted point of the same type \
+ in the `dst` mode.
+ """
+ from mmcv.core.bbox import Coord3DMode
+ return Coord3DMode.convert_point(
+ point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat)
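+
+
+# Short manipulation sketch for LiDARPoints (comments only). Rotation uses the default
+# z axis (rotation_axis = 2) and translate expects a length-3 vector.
+#
+#   >>> import numpy as np
+#   >>> import torch
+#   >>> pts = LiDARPoints(torch.rand(50, 4), points_dim=4,
+#   ...                   attribute_dims=dict(intensity=3))
+#   >>> _ = pts.rotate(np.pi / 2)                # rotate around z, returns rot matrix
+#   >>> pts.translate([1.0, 0.0, 0.0])
+#   >>> keep = pts.in_range_bev([-10, -10, 10, 10])
+#   >>> subset = pts[keep]                       # boolean indexing -> LiDARPoints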
diff --git a/mmcv/core/post_processing/__init__.py b/mmcv/core/post_processing/__init__.py
new file mode 100644
index 0000000..5335741
--- /dev/null
+++ b/mmcv/core/post_processing/__init__.py
@@ -0,0 +1,9 @@
+# from .merge_augs import (merge_aug_bboxes, merge_aug_masks,
+# merge_aug_scores, merge_aug_bboxes_3d)
+from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms, circle_nms
+
+# __all__ = [
+# 'merge_aug_bboxes',
+# 'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms',
+# 'aligned_3d_nms', 'merge_aug_bboxes_3d', 'circle_nms'
+# ]
diff --git a/mmcv/core/post_processing/bbox_nms.py b/mmcv/core/post_processing/bbox_nms.py
new file mode 100644
index 0000000..1b3c77a
--- /dev/null
+++ b/mmcv/core/post_processing/bbox_nms.py
@@ -0,0 +1,170 @@
+import torch
+from mmcv.ops.nms import batched_nms
+
+from mmcv.core.bbox.iou_calculators import bbox_overlaps
+
+
+def multiclass_nms(multi_bboxes,
+ multi_scores,
+ score_thr,
+ nms_cfg,
+ max_num=-1,
+ score_factors=None,
+ return_inds=False):
+ """NMS for multi-class bboxes.
+
+ Args:
+ multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+ multi_scores (Tensor): shape (n, #class), where the last column
+ contains scores of the background class, but this will be ignored.
+ score_thr (float): bbox threshold, bboxes with scores lower than it
+ will not be considered.
+        nms_cfg (dict): NMS config such as the NMS type and IoU threshold.
+ max_num (int, optional): if there are more than max_num bboxes after
+ NMS, only top max_num will be kept. Default to -1.
+ score_factors (Tensor, optional): The factors multiplied to scores
+ before applying NMS. Default to None.
+ return_inds (bool, optional): Whether return the indices of kept
+ bboxes. Default to False.
+
+ Returns:
+ tuple: (dets, labels, indices (optional)), tensors of shape (k, 5),
+ (k), and (k). Dets are boxes with scores. Labels are 0-based.
+ """
+ num_classes = multi_scores.size(1) - 1
+ # exclude background category
+ if multi_bboxes.shape[1] > 4:
+ bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
+ else:
+ bboxes = multi_bboxes[:, None].expand(
+ multi_scores.size(0), num_classes, 4)
+
+ scores = multi_scores[:, :-1]
+
+ labels = torch.arange(num_classes, dtype=torch.long)
+ labels = labels.view(1, -1).expand_as(scores)
+
+ bboxes = bboxes.reshape(-1, 4)
+ scores = scores.reshape(-1)
+ labels = labels.reshape(-1)
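+ # After flattening, every row is one (box, class) pair:
+ # bboxes (n*#class, 4), scores (n*#class, ), labels (n*#class, ).
+ # batched_nms below uses `labels` as the class index, so NMS runs
+ # independently per class in a single call.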
+
+ if not torch.onnx.is_in_onnx_export():
+ # NonZero not supported in TensorRT
+ # remove low scoring boxes
+ valid_mask = scores > score_thr
+ # multiply score_factor after threshold to preserve more bboxes, improve
+ # mAP by 1% for YOLOv3
+ if score_factors is not None:
+ # expand the shape to match original shape of score
+ score_factors = score_factors.view(-1, 1).expand(
+ multi_scores.size(0), num_classes)
+ score_factors = score_factors.reshape(-1)
+ scores = scores * score_factors
+
+ if not torch.onnx.is_in_onnx_export():
+ # NonZero not supported in TensorRT
+ inds = valid_mask.nonzero(as_tuple=False).squeeze(1)
+ bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds]
+ else:
+ # TensorRT NMS plugin has invalid output filled with -1
+ # add dummy data to make detection output correct.
+ bboxes = torch.cat([bboxes, bboxes.new_zeros(1, 4)], dim=0)
+ scores = torch.cat([scores, scores.new_zeros(1)], dim=0)
+ labels = torch.cat([labels, labels.new_zeros(1)], dim=0)
+
+ if bboxes.numel() == 0:
+ if torch.onnx.is_in_onnx_export():
+ raise RuntimeError('[ONNX Error] Can not record NMS '
+ 'as it has not been executed this time')
+ dets = torch.cat([bboxes, scores[:, None]], -1)
+ if return_inds:
+ return dets, labels, inds
+ else:
+ return dets, labels
+
+ dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)
+
+ if max_num > 0:
+ dets = dets[:max_num]
+ keep = keep[:max_num]
+
+ if return_inds:
+ return dets, labels[keep], keep
+ else:
+ return dets, labels[keep]
+
+
+def fast_nms(multi_bboxes,
+ multi_scores,
+ multi_coeffs,
+ score_thr,
+ iou_thr,
+ top_k,
+ max_num=-1):
+ """Fast NMS in `YOLACT <https://arxiv.org/abs/1904.02689>`_.
+
+ Fast NMS allows already-removed detections to suppress other detections so
+ that every instance can be decided to be kept or discarded in parallel,
+ which is not possible in traditional NMS. This relaxation allows us to
+ implement Fast NMS entirely in standard GPU-accelerated matrix operations.
+
+ Args:
+ multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+ multi_scores (Tensor): shape (n, #class+1), where the last column
+ contains scores of the background class, but this will be ignored.
+ multi_coeffs (Tensor): shape (n, #class*coeffs_dim).
+ score_thr (float): bbox threshold, bboxes with scores lower than it
+ will not be considered.
+ iou_thr (float): IoU threshold to be considered as conflicted.
+ top_k (int): if there are more than top_k bboxes before NMS,
+ only top top_k will be kept.
+ max_num (int): if there are more than max_num bboxes after NMS,
+ only top max_num will be kept. If -1, keep all the bboxes.
+ Default: -1.
+
+ Returns:
+ tuple: (dets, labels, coefficients), tensors of shape (k, 5), (k, 1),
+ and (k, coeffs_dim). Dets are boxes with scores.
+ Labels are 0-based.
+ """
+
+ scores = multi_scores[:, :-1].t() # [#class, n]
+ scores, idx = scores.sort(1, descending=True)
+
+ idx = idx[:, :top_k].contiguous()
+ scores = scores[:, :top_k] # [#class, topk]
+ num_classes, num_dets = idx.size()
+ boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4)
+ coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1)
+
+ iou = bbox_overlaps(boxes, boxes) # [#class, topk, topk]
+ iou.triu_(diagonal=1)
+ iou_max, _ = iou.max(dim=1)
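+ # Keeping only the upper triangle means each detection is compared
+ # against higher-scoring detections of the same class; a detection
+ # survives if its max IoU with all of them is below `iou_thr`.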
+
+ # Now just filter out the ones higher than the threshold
+ keep = iou_max <= iou_thr
+
+ # Second thresholding introduces 0.2 mAP gain at negligible time cost
+ keep *= scores > score_thr
+
+ # Assign each kept detection to its corresponding class
+ classes = torch.arange(
+ num_classes, device=boxes.device)[:, None].expand_as(keep)
+ classes = classes[keep]
+
+ boxes = boxes[keep]
+ coeffs = coeffs[keep]
+ scores = scores[keep]
+
+ # Only keep the top max_num highest scores across all classes
+ scores, idx = scores.sort(0, descending=True)
+ if max_num > 0:
+ idx = idx[:max_num]
+ scores = scores[:max_num]
+
+ classes = classes[idx]
+ boxes = boxes[idx]
+ coeffs = coeffs[idx]
+
+ cls_dets = torch.cat([boxes, scores[:, None]], dim=1)
+ return cls_dets, classes, coeffs
diff --git a/mmcv/core/post_processing/box3d_nms.py b/mmcv/core/post_processing/box3d_nms.py
new file mode 100644
index 0000000..8bede1b
--- /dev/null
+++ b/mmcv/core/post_processing/box3d_nms.py
@@ -0,0 +1,220 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numba
+import numpy as np
+import torch
+
+from mmcv.ops.iou3d_det.iou3d_utils import nms_gpu, nms_normal_gpu
+
+
+def box3d_multiclass_nms(mlvl_bboxes,
+ mlvl_bboxes_for_nms,
+ mlvl_scores,
+ score_thr,
+ max_num,
+ cfg,
+ mlvl_dir_scores=None,
+ mlvl_attr_scores=None,
+ mlvl_bboxes2d=None):
+ """Multi-class nms for 3D boxes.
+
+ Args:
+ mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M).
+ M is the dimension of each box.
+ mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape
+ (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes.
+ mlvl_scores (torch.Tensor): Multi-level scores with shape
+ (N, C + 1). N is the number of boxes. C is the number of classes.
+ score_thr (float): Score threshold to filter out boxes with low
+ confidence.
+ max_num (int): Maximum number of boxes to be kept.
+ cfg (dict): Configuration dict of NMS.
+ mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
+ of direction classifier. Defaults to None.
+ mlvl_attr_scores (torch.Tensor, optional): Multi-level scores
+ of attribute classifier. Defaults to None.
+ mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding
+ boxes. Defaults to None.
+
+ Returns:
+ tuple[torch.Tensor]: Return results after nms, including 3D \
+ bounding boxes, scores, labels, direction scores, attribute \
+ scores (optional) and 2D bounding boxes (optional).
+ """
+ # do multi class nms
+ # the fg class id range: [0, num_classes-1]
+ num_classes = mlvl_scores.shape[1] - 1
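+ # NMS is applied per foreground class on the BEV boxes
+ # (mlvl_bboxes_for_nms); the kept indices then gather the full 3D
+ # boxes, scores and the optional dir/attr/2D outputs.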
+ bboxes = []
+ scores = []
+ labels = []
+ dir_scores = []
+ attr_scores = []
+ bboxes2d = []
+ for i in range(0, num_classes):
+ # get bboxes and scores of this class
+ cls_inds = mlvl_scores[:, i] > score_thr
+ if not cls_inds.any():
+ continue
+
+ _scores = mlvl_scores[cls_inds, i]
+ _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :]
+
+ if cfg.use_rotate_nms:
+ nms_func = nms_gpu
+ else:
+ nms_func = nms_normal_gpu
+
+ selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr)
+ _mlvl_bboxes = mlvl_bboxes[cls_inds, :]
+ bboxes.append(_mlvl_bboxes[selected])
+ scores.append(_scores[selected])
+ cls_label = mlvl_bboxes.new_full((len(selected), ),
+ i,
+ dtype=torch.long)
+ labels.append(cls_label)
+
+ if mlvl_dir_scores is not None:
+ _mlvl_dir_scores = mlvl_dir_scores[cls_inds]
+ dir_scores.append(_mlvl_dir_scores[selected])
+ if mlvl_attr_scores is not None:
+ _mlvl_attr_scores = mlvl_attr_scores[cls_inds]
+ attr_scores.append(_mlvl_attr_scores[selected])
+ if mlvl_bboxes2d is not None:
+ _mlvl_bboxes2d = mlvl_bboxes2d[cls_inds]
+ bboxes2d.append(_mlvl_bboxes2d[selected])
+
+ if bboxes:
+ bboxes = torch.cat(bboxes, dim=0)
+ scores = torch.cat(scores, dim=0)
+ labels = torch.cat(labels, dim=0)
+ if mlvl_dir_scores is not None:
+ dir_scores = torch.cat(dir_scores, dim=0)
+ if mlvl_attr_scores is not None:
+ attr_scores = torch.cat(attr_scores, dim=0)
+ if mlvl_bboxes2d is not None:
+ bboxes2d = torch.cat(bboxes2d, dim=0)
+ if bboxes.shape[0] > max_num:
+ _, inds = scores.sort(descending=True)
+ inds = inds[:max_num]
+ bboxes = bboxes[inds, :]
+ labels = labels[inds]
+ scores = scores[inds]
+ if mlvl_dir_scores is not None:
+ dir_scores = dir_scores[inds]
+ if mlvl_attr_scores is not None:
+ attr_scores = attr_scores[inds]
+ if mlvl_bboxes2d is not None:
+ bboxes2d = bboxes2d[inds]
+ else:
+ bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
+ scores = mlvl_scores.new_zeros((0, ))
+ labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
+ if mlvl_dir_scores is not None:
+ dir_scores = mlvl_scores.new_zeros((0, ))
+ if mlvl_attr_scores is not None:
+ attr_scores = mlvl_scores.new_zeros((0, ))
+ if mlvl_bboxes2d is not None:
+ bboxes2d = mlvl_scores.new_zeros((0, 4))
+
+ results = (bboxes, scores, labels)
+
+ if mlvl_dir_scores is not None:
+ results = results + (dir_scores, )
+ if mlvl_attr_scores is not None:
+ results = results + (attr_scores, )
+ if mlvl_bboxes2d is not None:
+ results = results + (bboxes2d, )
+
+ return results
+
+
+def aligned_3d_nms(boxes, scores, classes, thresh):
+ """3d nms for aligned boxes.
+
+ Args:
+ boxes (torch.Tensor): Aligned box with shape [n, 6].
+ scores (torch.Tensor): Scores of each box.
+ classes (torch.Tensor): Class of each box.
+ thresh (float): IoU threshold for NMS.
+
+ Returns:
+ torch.Tensor: Indices of selected boxes.
+ """
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ z1 = boxes[:, 2]
+ x2 = boxes[:, 3]
+ y2 = boxes[:, 4]
+ z2 = boxes[:, 5]
+ area = (x2 - x1) * (y2 - y1) * (z2 - z1)
+ zero = boxes.new_zeros(1, )
+
+ score_sorted = torch.argsort(scores)
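+ # Greedy NMS: scores are sorted in ascending order, so the current
+ # best box is taken from the end each iteration. The axis-aligned 3D
+ # IoU is zeroed across classes, so boxes only suppress boxes of the
+ # same class.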
+ pick = []
+ while (score_sorted.shape[0] != 0):
+ last = score_sorted.shape[0]
+ i = score_sorted[-1]
+ pick.append(i)
+
+ xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
+ yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
+ zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
+ xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
+ yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
+ zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
+ classes1 = classes[i]
+ classes2 = classes[score_sorted[:last - 1]]
+ inter_l = torch.max(zero, xx2 - xx1)
+ inter_w = torch.max(zero, yy2 - yy1)
+ inter_h = torch.max(zero, zz2 - zz1)
+
+ inter = inter_l * inter_w * inter_h
+ iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)
+ iou = iou * (classes1 == classes2).float()
+ score_sorted = score_sorted[torch.nonzero(
+ iou <= thresh, as_tuple=False).flatten()]
+
+ indices = boxes.new_tensor(pick, dtype=torch.long)
+ return indices
+
+
+@numba.jit(nopython=True)
+def circle_nms(dets, thresh, post_max_size=83):
+ """Circular NMS.
+
+ An object is only counted as positive if no other center
+ with a higher confidence exists within a radius r using a
+ bird-eye view distance metric.
+
+ Args:
+ dets (torch.Tensor): Detection results with the shape of [N, 3].
+ thresh (float): Value of threshold.
+ post_max_size (int): Max number of predictions to be kept.
+ Defaults to 83.
+
+ Returns:
+ torch.Tensor: Indexes of the detections to be kept.
+ """
+ x1 = dets[:, 0]
+ y1 = dets[:, 1]
+ scores = dets[:, 2]
+ order = scores.argsort()[::-1].astype(np.int32) # highest->lowest
+ ndets = dets.shape[0]
+ suppressed = np.zeros((ndets), dtype=np.int32)
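+ # Note: `dist` below is the squared BEV center distance, so `thresh`
+ # is effectively a squared radius.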
+ keep = []
+ for _i in range(ndets):
+ i = order[_i] # start with highest score box
+ if suppressed[
+ i] == 1: # this box was already suppressed by a higher-scoring one
+ continue
+ keep.append(i)
+ for _j in range(_i + 1, ndets):
+ j = order[_j]
+ if suppressed[j] == 1:
+ continue
+ # calculate center distance between i and j box
+ dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2
+
+ # ovr = inter / areas[j]
+ if dist <= thresh:
+ suppressed[j] = 1
+ return keep[:post_max_size]
diff --git a/mmcv/core/post_processing/merge_augs.py b/mmcv/core/post_processing/merge_augs.py
new file mode 100644
index 0000000..e96dc3b
--- /dev/null
+++ b/mmcv/core/post_processing/merge_augs.py
@@ -0,0 +1,241 @@
+import copy
+import warnings
+
+import numpy as np
+import torch
+from mmcv import ConfigDict
+from mmcv.ops import nms
+
+from mmcv.ops.iou3d_det.iou3d_utils import nms_gpu, nms_normal_gpu
+from ..bbox.transforms import bbox_mapping_back, bbox3d2result, bbox3d_mapping_back
+from ..bbox.structures.utils import xywhr2xyxyr
+
+def merge_aug_proposals(aug_proposals, img_metas, cfg):
+ """Merge augmented proposals (multiscale, flip, etc.)
+
+ Args:
+ aug_proposals (list[Tensor]): proposals from different testing
+ schemes, shape (n, 5). Note that they are not rescaled to the
+ original image size.
+
+ img_metas (list[dict]): list of image info dict where each dict has:
+ 'img_shape', 'scale_factor', 'flip', and may also contain
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+ For details on the values of these keys see
+ `mmcv/datasets/pipelines/formatting.py:Collect`.
+
+ cfg (dict): rpn test config.
+
+ Returns:
+ Tensor: shape (n, 4), proposals corresponding to original image scale.
+ """
+
+ cfg = copy.deepcopy(cfg)
+
+ # deprecate arguments warning
+ if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg:
+ warnings.warn(
+ 'In rpn_proposal or test_cfg, '
+ 'nms_thr has been moved to a dict named nms as '
+ 'iou_threshold, max_num has been renamed as max_per_img, '
+ 'name of original arguments and the way to specify '
+ 'iou_threshold of NMS will be deprecated.')
+ if 'nms' not in cfg:
+ cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr))
+ if 'max_num' in cfg:
+ if 'max_per_img' in cfg:
+ assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \
+ f'max_per_img at the same time, but got {cfg.max_num} ' \
+ f'and {cfg.max_per_img} respectively. ' \
+ f'Please delete max_num which will be deprecated.'
+ else:
+ cfg.max_per_img = cfg.max_num
+ if 'nms_thr' in cfg:
+ assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \
+ f'iou_threshold in nms and ' \
+ f'nms_thr at the same time, but get ' \
+ f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \
+ f' respectively. Please delete the nms_thr ' \
+ f'which will be deprecated.'
+
+ recovered_proposals = []
+ for proposals, img_info in zip(aug_proposals, img_metas):
+ img_shape = img_info['img_shape']
+ scale_factor = img_info['scale_factor']
+ flip = img_info['flip']
+ flip_direction = img_info['flip_direction']
+ _proposals = proposals.clone()
+ _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
+ scale_factor, flip,
+ flip_direction)
+ recovered_proposals.append(_proposals)
+ aug_proposals = torch.cat(recovered_proposals, dim=0)
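+ # Proposals from all augmentations have been mapped back to the
+ # original image coordinates; de-duplicate them with a single NMS and
+ # keep the top `max_per_img` by score.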
+ merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
+ aug_proposals[:, -1].contiguous(),
+ cfg.nms.iou_threshold)
+ scores = merged_proposals[:, 4]
+ _, order = scores.sort(0, descending=True)
+ num = min(cfg.max_per_img, merged_proposals.shape[0])
+ order = order[:num]
+ merged_proposals = merged_proposals[order, :]
+ return merged_proposals
+
+
+def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
+ """Merge augmented detection bboxes and scores.
+
+ Args:
+ aug_bboxes (list[Tensor]): shape (n, 4*#class)
+ aug_scores (list[Tensor] or None): shape (n, #class)
+ img_metas (list[list[dict]]): Meta information of each image.
+ rcnn_test_cfg (dict): rcnn test config.
+
+ Returns:
+ tuple: (bboxes, scores)
+ """
+ recovered_bboxes = []
+ for bboxes, img_info in zip(aug_bboxes, img_metas):
+ img_shape = img_info[0]['img_shape']
+ scale_factor = img_info[0]['scale_factor']
+ flip = img_info[0]['flip']
+ flip_direction = img_info[0]['flip_direction']
+ bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip,
+ flip_direction)
+ recovered_bboxes.append(bboxes)
+ bboxes = torch.stack(recovered_bboxes).mean(dim=0)
+ if aug_scores is None:
+ return bboxes
+ else:
+ scores = torch.stack(aug_scores).mean(dim=0)
+ return bboxes, scores
+
+
+def merge_aug_scores(aug_scores):
+ """Merge augmented bbox scores."""
+ if isinstance(aug_scores[0], torch.Tensor):
+ return torch.mean(torch.stack(aug_scores), dim=0)
+ else:
+ return np.mean(aug_scores, axis=0)
+
+
+def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None):
+ """Merge augmented mask prediction.
+
+ Args:
+ aug_masks (list[ndarray]): shape (n, #class, h, w)
+ img_metas (list[list[dict]]): Meta information of each image.
+ rcnn_test_cfg (dict): rcnn test config.
+
+ Returns:
+ ndarray: Merged masks averaged (optionally weighted) over all
+ augmentations.
+ """
+ recovered_masks = []
+ for mask, img_info in zip(aug_masks, img_metas):
+ flip = img_info[0]['flip']
+ flip_direction = img_info[0]['flip_direction']
+ if flip:
+ if flip_direction == 'horizontal':
+ mask = mask[:, :, :, ::-1]
+ elif flip_direction == 'vertical':
+ mask = mask[:, :, ::-1, :]
+ elif flip_direction == 'diagonal':
+ mask = mask[:, :, :, ::-1]
+ mask = mask[:, :, ::-1, :]
+ else:
+ raise ValueError(
+ f"Invalid flipping direction '{flip_direction}'")
+ recovered_masks.append(mask)
+
+ if weights is None:
+ merged_masks = np.mean(recovered_masks, axis=0)
+ else:
+ merged_masks = np.average(
+ np.array(recovered_masks), axis=0, weights=np.array(weights))
+ return merged_masks
+
+def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg):
+ """Merge augmented detection 3D bboxes and scores.
+
+ Args:
+ aug_results (list[dict]): The dict of detection results.
+ The dict contains the following keys
+
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+ - labels_3d (torch.Tensor): Predicted box labels.
+ img_metas (list[dict]): Meta information of each sample.
+ test_cfg (dict): Test config.
+
+ Returns:
+ dict: Bounding boxes results in cpu mode, containing merged results.
+
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox.
+ - scores_3d (torch.Tensor): Merged detection scores.
+ - labels_3d (torch.Tensor): Merged predicted box labels.
+ """
+
+ assert len(aug_results) == len(img_metas), \
+ '"aug_results" should have the same length as "img_metas", got len(' \
+ f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}'
+
+ recovered_bboxes = []
+ recovered_scores = []
+ recovered_labels = []
+
+ for bboxes, img_info in zip(aug_results, img_metas):
+ scale_factor = img_info[0]['pcd_scale_factor']
+ pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip']
+ pcd_vertical_flip = img_info[0]['pcd_vertical_flip']
+ recovered_scores.append(bboxes['scores_3d'])
+ recovered_labels.append(bboxes['labels_3d'])
+ bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor,
+ pcd_horizontal_flip, pcd_vertical_flip)
+ recovered_bboxes.append(bboxes)
+
+ aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes)
+ aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev)
+ aug_scores = torch.cat(recovered_scores, dim=0)
+ aug_labels = torch.cat(recovered_labels, dim=0)
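+ # All augmented predictions are now in the original point cloud frame
+ # and concatenated; NMS below runs per class on the BEV projection
+ # converted from (x, y, w, h, r) to (x1, y1, x2, y2, r) format.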
+
+ # TODO: use a more elegant way to deal with NMS
+ if test_cfg.use_rotate_nms:
+ nms_func = nms_gpu
+ else:
+ nms_func = nms_normal_gpu
+
+ merged_bboxes = []
+ merged_scores = []
+ merged_labels = []
+
+ # Apply multi-class nms when merge bboxes
+ if len(aug_labels) == 0:
+ return bbox3d2result(aug_bboxes, aug_scores, aug_labels)
+
+ for class_id in range(torch.max(aug_labels).item() + 1):
+ class_inds = (aug_labels == class_id)
+ bboxes_i = aug_bboxes[class_inds]
+ bboxes_nms_i = aug_bboxes_for_nms[class_inds, :]
+ scores_i = aug_scores[class_inds]
+ labels_i = aug_labels[class_inds]
+ if len(bboxes_nms_i) == 0:
+ continue
+ selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr)
+
+ merged_bboxes.append(bboxes_i[selected, :])
+ merged_scores.append(scores_i[selected])
+ merged_labels.append(labels_i[selected])
+
+ merged_bboxes = merged_bboxes[0].cat(merged_bboxes)
+ merged_scores = torch.cat(merged_scores, dim=0)
+ merged_labels = torch.cat(merged_labels, dim=0)
+
+ _, order = merged_scores.sort(0, descending=True)
+ num = min(test_cfg.max_num, len(aug_bboxes))
+ order = order[:num]
+
+ merged_bboxes = merged_bboxes[order]
+ merged_scores = merged_scores[order]
+ merged_labels = merged_labels[order]
+
+ return bbox3d2result(merged_bboxes, merged_scores, merged_labels)
+
diff --git a/mmcv/core/utils/__init__.py b/mmcv/core/utils/__init__.py
new file mode 100644
index 0000000..b127388
--- /dev/null
+++ b/mmcv/core/utils/__init__.py
@@ -0,0 +1,9 @@
+from .dist_utils import DistOptimizerHook, allreduce_grads, reduce_mean
+from .misc import flip_tensor, mask2ndarray, multi_apply, unmap, add_prefix
+from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
+
+__all__ = [
+ 'allreduce_grads', 'DistOptimizerHook', 'reduce_mean', 'multi_apply',
+ 'unmap', 'mask2ndarray', 'flip_tensor', 'add_prefix',
+ 'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian'
+]
diff --git a/mmcv/core/utils/dist_utils.py b/mmcv/core/utils/dist_utils.py
new file mode 100644
index 0000000..5fe7775
--- /dev/null
+++ b/mmcv/core/utils/dist_utils.py
@@ -0,0 +1,69 @@
+import warnings
+from collections import OrderedDict
+
+import torch.distributed as dist
+from mmcv.runner import OptimizerHook
+from torch._utils import (_flatten_dense_tensors, _take_tensors,
+ _unflatten_dense_tensors)
+
+
+def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
+ if bucket_size_mb > 0:
+ bucket_size_bytes = bucket_size_mb * 1024 * 1024
+ buckets = _take_tensors(tensors, bucket_size_bytes)
+ else:
+ buckets = OrderedDict()
+ for tensor in tensors:
+ tp = tensor.type()
+ if tp not in buckets:
+ buckets[tp] = []
+ buckets[tp].append(tensor)
+ buckets = buckets.values()
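+ # Each bucket (grouped by byte budget or by dtype) is flattened into
+ # one tensor, all-reduced once, averaged by world size, then copied
+ # back into the original gradient tensors.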
+
+ for bucket in buckets:
+ flat_tensors = _flatten_dense_tensors(bucket)
+ dist.all_reduce(flat_tensors)
+ flat_tensors.div_(world_size)
+ for tensor, synced in zip(
+ bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
+ tensor.copy_(synced)
+
+
+def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
+ """Allreduce gradients.
+
+ Args:
+ params (list[torch.Parameters]): List of parameters of a model
+ coalesce (bool, optional): Whether allreduce parameters as a whole.
+ Defaults to True.
+ bucket_size_mb (int, optional): Size of bucket, the unit is MB.
+ Defaults to -1.
+ """
+ grads = [
+ param.grad.data for param in params
+ if param.requires_grad and param.grad is not None
+ ]
+ world_size = dist.get_world_size()
+ if coalesce:
+ _allreduce_coalesced(grads, world_size, bucket_size_mb)
+ else:
+ for tensor in grads:
+ dist.all_reduce(tensor.div_(world_size))
+
+
+class DistOptimizerHook(OptimizerHook):
+ """Deprecated optimizer hook for distributed training."""
+
+ def __init__(self, *args, **kwargs):
+ warnings.warn('"DistOptimizerHook" is deprecated, please switch to '
+ '"mmcv.runner.OptimizerHook".')
+ super().__init__(*args, **kwargs)
+
+
+def reduce_mean(tensor):
+ """Obtain the mean of a tensor across different GPUs."""
+ if not (dist.is_available() and dist.is_initialized()):
+ return tensor
+ tensor = tensor.clone()
+ dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
+ return tensor
diff --git a/mmcv/core/utils/gaussian.py b/mmcv/core/utils/gaussian.py
new file mode 100644
index 0000000..a07963e
--- /dev/null
+++ b/mmcv/core/utils/gaussian.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+
+def gaussian_2d(shape, sigma=1):
+ """Generate gaussian map.
+
+ Args:
+ shape (list[int]): Shape of the map.
+ sigma (float): Sigma to generate gaussian map.
+ Defaults to 1.
+
+ Returns:
+ np.ndarray: Generated gaussian map.
+ """
+ m, n = [(ss - 1.) / 2. for ss in shape]
+ y, x = np.ogrid[-m:m + 1, -n:n + 1]
+
+ h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
+ h[h < np.finfo(h.dtype).eps * h.max()] = 0
+ return h
+
+
+def draw_heatmap_gaussian(heatmap, center, radius, k=1):
+ """Get gaussian masked heatmap.
+
+ Args:
+ heatmap (torch.Tensor): Heatmap to be masked.
+ center (torch.Tensor): Center coord of the heatmap.
+ radius (int): Radius of the gaussian.
+ k (int): Multiple of masked_gaussian. Defaults to 1.
+
+ Returns:
+ torch.Tensor: Masked heatmap.
+ """
+ diameter = 2 * radius + 1
+ gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6)
+
+ x, y = int(center[0]), int(center[1])
+
+ height, width = heatmap.shape[0:2]
+
+ left, right = min(x, radius), min(width - x, radius + 1)
+ top, bottom = min(y, radius), min(height - y, radius + 1)
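+ # Crop both the heatmap region and the gaussian patch so that centers
+ # near the border stay inside the heatmap, then merge with an
+ # element-wise max so overlapping objects keep the stronger response.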
+
+ masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+ masked_gaussian = torch.from_numpy(
+ gaussian[radius - top:radius + bottom,
+ radius - left:radius + right]).to(heatmap.device,
+ torch.float32)
+ if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
+ torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
+ return heatmap
+
+
+def gaussian_radius(det_size, min_overlap=0.5):
+ """Get radius of gaussian.
+
+ Args:
+ det_size (tuple[torch.Tensor]): Size of the detection result.
+ min_overlap (float): Gaussian_overlap. Defaults to 0.5.
+
+ Returns:
+ torch.Tensor: Computed radius.
+ """
+ height, width = det_size
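+ # The three cases below solve quadratic constraints (following the
+ # CornerNet heuristic) for the largest radius such that a box shifted
+ # by that radius still has at least `min_overlap` IoU with the
+ # ground-truth box.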
+
+ a1 = 1
+ b1 = (height + width)
+ c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+ sq1 = torch.sqrt(b1**2 - 4 * a1 * c1)
+ r1 = (b1 + sq1) / 2
+
+ a2 = 4
+ b2 = 2 * (height + width)
+ c2 = (1 - min_overlap) * width * height
+ sq2 = torch.sqrt(b2**2 - 4 * a2 * c2)
+ r2 = (b2 + sq2) / 2
+
+ a3 = 4 * min_overlap
+ b3 = -2 * min_overlap * (height + width)
+ c3 = (min_overlap - 1) * width * height
+ sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
+ r3 = (b3 + sq3) / 2
+ return min(r1, r2, r3)
diff --git a/mmcv/core/utils/misc.py b/mmcv/core/utils/misc.py
new file mode 100644
index 0000000..52e1897
--- /dev/null
+++ b/mmcv/core/utils/misc.py
@@ -0,0 +1,102 @@
+from functools import partial
+
+import numpy as np
+import torch
+from six.moves import map, zip
+
+from ..mask.structures import BitmapMasks, PolygonMasks
+
+
+def multi_apply(func, *args, **kwargs):
+ """Apply function to a list of arguments.
+
+ Note:
+ This function applies the ``func`` to multiple inputs and
+ map the multiple outputs of the ``func`` into different
+ list. Each list contains the same type of outputs corresponding
+ to different inputs.
+
+ Args:
+ func (Function): A function that will be applied to a list of
+ arguments
+
+ Returns:
+ tuple(list): A tuple containing multiple lists, where each list \
+ contains one kind of the returned results of the function.
+ """
+ pfunc = partial(func, **kwargs) if kwargs else func
+ map_results = map(pfunc, *args)
+ return tuple(map(list, zip(*map_results)))
+
+
+def unmap(data, count, inds, fill=0):
+ """Unmap a subset of items (data) back to the original set of items (of
+ size count)."""
+ if data.dim() == 1:
+ ret = data.new_full((count, ), fill)
+ ret[inds.type(torch.bool)] = data
+ else:
+ new_size = (count, ) + data.size()[1:]
+ ret = data.new_full(new_size, fill)
+ ret[inds.type(torch.bool), :] = data
+ return ret
+
+
+def mask2ndarray(mask):
+ """Convert Mask to ndarray.
+
+ Args:
+ mask (:obj:`BitmapMasks` or :obj:`PolygonMasks` or
+ torch.Tensor or np.ndarray): The mask to be converted.
+
+ Returns:
+ np.ndarray: Ndarray mask of shape (n, h, w) that has been converted
+ """
+ if isinstance(mask, (BitmapMasks, PolygonMasks)):
+ mask = mask.to_ndarray()
+ elif isinstance(mask, torch.Tensor):
+ mask = mask.detach().cpu().numpy()
+ elif not isinstance(mask, np.ndarray):
+ raise TypeError(f'Unsupported {type(mask)} data type')
+ return mask
+
+
+def flip_tensor(src_tensor, flip_direction):
+ """Flip a tensor based on flip_direction.
+
+ Args:
+ src_tensor (Tensor): input feature map, shape (B, C, H, W).
+ flip_direction (str): The flipping direction. Options are
+ 'horizontal', 'vertical', 'diagonal'.
+
+ Returns:
+ out_tensor (Tensor): Flipped tensor.
+ """
+ assert src_tensor.ndim == 4
+ valid_directions = ['horizontal', 'vertical', 'diagonal']
+ assert flip_direction in valid_directions
+ if flip_direction == 'horizontal':
+ out_tensor = torch.flip(src_tensor, [3])
+ elif flip_direction == 'vertical':
+ out_tensor = torch.flip(src_tensor, [2])
+ else:
+ out_tensor = torch.flip(src_tensor, [2, 3])
+ return out_tensor
+
+def add_prefix(inputs, prefix):
+ """Add prefix for dict.
+
+ Args:
+ inputs (dict): The input dict with str keys.
+ prefix (str): The prefix to add.
+
+ Returns:
+ dict: The dict with keys updated with ``prefix``.
+ """
+
+ outputs = dict()
+ for name, value in inputs.items():
+ outputs[f'{prefix}.{name}'] = value
+
+ return outputs
diff --git a/mmcv/core/visualization/__init__.py b/mmcv/core/visualization/__init__.py
new file mode 100644
index 0000000..af64628
--- /dev/null
+++ b/mmcv/core/visualization/__init__.py
@@ -0,0 +1,4 @@
+from .image import (color_val_matplotlib, imshow_det_bboxes,
+ imshow_gt_det_bboxes, imshow, color_val)
+
+__all__ = ['imshow_det_bboxes', 'imshow_gt_det_bboxes', 'color_val_matplotlib']
diff --git a/mmcv/core/visualization/image.py b/mmcv/core/visualization/image.py
new file mode 100644
index 0000000..a001853
--- /dev/null
+++ b/mmcv/core/visualization/image.py
@@ -0,0 +1,372 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import cv2
+import pycocotools.mask as mask_util
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Polygon
+from enum import Enum
+from mmcv.utils import concat_list, is_str
+from mmcv.image import imread, imwrite
+from mmcv.image import bgr2rgb, rgb2bgr
+from ..utils import mask2ndarray
+
+EPS = 1e-2
+
+
+class Color(Enum):
+ """An enum that defines common colors.
+
+ Contains red, green, blue, cyan, yellow, magenta, white and black.
+ """
+ red = (0, 0, 255)
+ green = (0, 255, 0)
+ blue = (255, 0, 0)
+ cyan = (255, 255, 0)
+ yellow = (0, 255, 255)
+ magenta = (255, 0, 255)
+ white = (255, 255, 255)
+ black = (0, 0, 0)
+
+
+def color_val(color):
+ """Convert various input to color tuples.
+
+ Args:
+ color (:obj:`Color`/str/tuple/int/ndarray): Color inputs
+
+ Returns:
+ tuple[int]: A tuple of 3 integers indicating BGR channels.
+ """
+ if is_str(color):
+ return Color[color].value
+ elif isinstance(color, Color):
+ return color.value
+ elif isinstance(color, tuple):
+ assert len(color) == 3
+ for channel in color:
+ assert 0 <= channel <= 255
+ return color
+ elif isinstance(color, int):
+ assert 0 <= color <= 255
+ return color, color, color
+ elif isinstance(color, np.ndarray):
+ assert color.ndim == 1 and color.size == 3
+ assert np.all((color >= 0) & (color <= 255))
+ color = color.astype(np.uint8)
+ return tuple(color)
+ else:
+ raise TypeError(f'Invalid type for color: {type(color)}')
+
+
+
+def color_val_matplotlib(color):
+ """Convert various input in BGR order to normalized RGB matplotlib color
+ tuples.
+
+ Args:
+ color (:obj:`Color`/str/tuple/int/ndarray): Color inputs
+
+ Returns:
+ tuple[float]: A tuple of 3 normalized floats indicating RGB channels.
+ """
+ color = color_val(color)
+ color = [color / 255 for color in color[::-1]]
+ return tuple(color)
+
+def imshow(img, win_name='', wait_time=0):
+ """Show an image.
+
+ Args:
+ img (str or ndarray): The image to be displayed.
+ win_name (str): The window name.
+ wait_time (int): Value of waitKey param.
+ """
+ cv2.imshow(win_name, imread(img))
+ if wait_time == 0: # prevent from hanging if the window was closed
+ while True:
+ ret = cv2.waitKey(1)
+
+ closed = cv2.getWindowProperty(win_name, cv2.WND_PROP_VISIBLE) < 1
+ # if user closed window or if some key pressed
+ if closed or ret != -1:
+ break
+ else:
+ ret = cv2.waitKey(wait_time)
+
+
+def imshow_det_bboxes(img,
+ bboxes,
+ labels,
+ segms=None,
+ class_names=None,
+ score_thr=0,
+ bbox_color='green',
+ text_color='green',
+ mask_color=None,
+ thickness=2,
+ font_size=13,
+ win_name='',
+ show=True,
+ wait_time=0,
+ out_file=None):
+ """Draw bboxes and class labels (with scores) on an image.
+
+ Args:
+ img (str or ndarray): The image to be displayed.
+ bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or
+ (n, 5).
+ labels (ndarray): Labels of bboxes.
+ segms (ndarray or None): Masks, shaped (n,h,w) or None
+ class_names (list[str]): Names of each class.
+ score_thr (float): Minimum score of bboxes to be shown. Default: 0
+ bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines.
+ The tuple of color should be in BGR order. Default: 'green'
+ text_color (str or tuple(int) or :obj:`Color`):Color of texts.
+ The tuple of color should be in BGR order. Default: 'green'
+ mask_color (str or tuple(int) or :obj:`Color`, optional):
+ Color of masks. The tuple of color should be in BGR order.
+ Default: None
+ thickness (int): Thickness of lines. Default: 2
+ font_size (int): Font size of texts. Default: 13
+ show (bool): Whether to show the image. Default: True
+ win_name (str): The window name. Default: ''
+ wait_time (float): Value of waitKey param. Default: 0.
+ out_file (str, optional): The filename to write the image.
+ Default: None
+
+ Returns:
+ ndarray: The image with bboxes drawn on it.
+ """
+ assert bboxes.ndim == 2, \
+ f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.'
+ assert labels.ndim == 1, \
+ f' labels ndim should be 1, but its ndim is {labels.ndim}.'
+ assert bboxes.shape[0] == labels.shape[0], \
+ 'bboxes.shape[0] and labels.shape[0] should be equal.'
+ assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \
+ f'bboxes.shape[1] should be 4 or 5, but it is {bboxes.shape[1]}.'
+ img = imread(img).astype(np.uint8)
+
+ if score_thr > 0:
+ assert bboxes.shape[1] == 5
+ scores = bboxes[:, -1]
+ inds = scores > score_thr
+ bboxes = bboxes[inds, :]
+ labels = labels[inds]
+ if segms is not None:
+ segms = segms[inds, ...]
+
+ mask_colors = []
+ if labels.shape[0] > 0:
+ if mask_color is None:
+ # random color
+ np.random.seed(42)
+ mask_colors = [
+ np.random.randint(0, 256, (1, 3), dtype=np.uint8)
+ for _ in range(max(labels) + 1)
+ ]
+ else:
+ # specify color
+ mask_colors = [
+ np.array(color_val(mask_color)[::-1], dtype=np.uint8)
+ ] * (
+ max(labels) + 1)
+
+ bbox_color = color_val_matplotlib(bbox_color)
+ text_color = color_val_matplotlib(text_color)
+
+ img = bgr2rgb(img)
+ width, height = img.shape[1], img.shape[0]
+ img = np.ascontiguousarray(img)
+
+ fig = plt.figure(win_name, frameon=False)
+ plt.title(win_name)
+ canvas = fig.canvas
+ dpi = fig.get_dpi()
+ # add a small EPS to avoid precision lost due to matplotlib's truncation
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
+ fig.set_size_inches((width + EPS) / dpi, (height + EPS) / dpi)
+
+ # remove white edges by set subplot margin
+ plt.subplots_adjust(left=0, right=1, bottom=0, top=1)
+ ax = plt.gca()
+ ax.axis('off')
+
+ polygons = []
+ color = []
+ for i, (bbox, label) in enumerate(zip(bboxes, labels)):
+ bbox_int = bbox.astype(np.int32)
+ poly = [[bbox_int[0], bbox_int[1]], [bbox_int[0], bbox_int[3]],
+ [bbox_int[2], bbox_int[3]], [bbox_int[2], bbox_int[1]]]
+ np_poly = np.array(poly).reshape((4, 2))
+ polygons.append(Polygon(np_poly))
+ color.append(bbox_color)
+ label_text = class_names[
+ label] if class_names is not None else f'class {label}'
+ if len(bbox) > 4:
+ label_text += f'|{bbox[-1]:.02f}'
+ ax.text(
+ bbox_int[0],
+ bbox_int[1],
+ f'{label_text}',
+ bbox={
+ 'facecolor': 'black',
+ 'alpha': 0.8,
+ 'pad': 0.7,
+ 'edgecolor': 'none'
+ },
+ color=text_color,
+ fontsize=font_size,
+ verticalalignment='top',
+ horizontalalignment='left')
+ if segms is not None:
+ color_mask = mask_colors[labels[i]]
+ mask = segms[i].astype(bool)
+ img[mask] = img[mask] * 0.5 + color_mask * 0.5
+
+ plt.imshow(img)
+
+ p = PatchCollection(
+ polygons, facecolor='none', edgecolors=color, linewidths=thickness)
+ ax.add_collection(p)
+
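+ # Render the matplotlib figure into an RGBA buffer and convert it
+ # back to a BGR numpy image so the return value matches OpenCV
+ # conventions.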
+ stream, _ = canvas.print_to_buffer()
+ buffer = np.frombuffer(stream, dtype='uint8')
+ img_rgba = buffer.reshape(height, width, 4)
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
+ img = rgb.astype('uint8')
+ img = rgb2bgr(img)
+
+ if show:
+ # We do not use cv2 for display because in some cases, opencv will
+ # conflict with Qt, it will output a warning: Current thread
+ # is not the object's thread. You can refer to
+ # https://github.com/opencv/opencv-python/issues/46 for details
+ if wait_time == 0:
+ plt.show()
+ else:
+ plt.show(block=False)
+ plt.pause(wait_time)
+ if out_file is not None:
+ imwrite(img, out_file)
+
+ plt.close()
+
+ return img
+
+
+def imshow_gt_det_bboxes(img,
+ annotation,
+ result,
+ class_names=None,
+ score_thr=0,
+ gt_bbox_color=(255, 102, 61),
+ gt_text_color=(255, 102, 61),
+ gt_mask_color=(255, 102, 61),
+ det_bbox_color=(72, 101, 241),
+ det_text_color=(72, 101, 241),
+ det_mask_color=(72, 101, 241),
+ thickness=2,
+ font_size=13,
+ win_name='',
+ show=True,
+ wait_time=0,
+ out_file=None):
+ """General visualization GT and result function.
+
+ Args:
+ img (str or ndarray): The image to be displayed.
+ annotation (dict): Ground truth annotations which contain the keys
+ 'gt_bboxes' and 'gt_labels', and optionally 'gt_masks'.
+ result (tuple[list] or list): The detection result, can be either
+ (bbox, segm) or just bbox.
+ class_names (list[str]): Names of each class.
+ score_thr (float): Minimum score of bboxes to be shown. Default: 0
+ gt_bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines.
+ The tuple of color should be in BGR order. Default: (255, 102, 61)
+ gt_text_color (str or tuple(int) or :obj:`Color`):Color of texts.
+ The tuple of color should be in BGR order. Default: (255, 102, 61)
+ gt_mask_color (str or tuple(int) or :obj:`Color`, optional):
+ Color of masks. The tuple of color should be in BGR order.
+ Default: (255, 102, 61)
+ det_bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines.
+ The tuple of color should be in BGR order. Default: (72, 101, 241)
+ det_text_color (str or tuple(int) or :obj:`Color`):Color of texts.
+ The tuple of color should be in BGR order. Default: (72, 101, 241)
+ det_mask_color (str or tuple(int) or :obj:`Color`, optional):
+ Color of masks. The tuple of color should be in BGR order.
+ Default: (72, 101, 241)
+ thickness (int): Thickness of lines. Default: 2
+ font_size (int): Font size of texts. Default: 13
+ win_name (str): The window name. Default: ''
+ show (bool): Whether to show the image. Default: True
+ wait_time (float): Value of waitKey param. Default: 0.
+ out_file (str, optional): The filename to write the image.
+ Default: None
+
+ Returns:
+ ndarray: The image with bboxes or masks drawn on it.
+ """
+ assert 'gt_bboxes' in annotation
+ assert 'gt_labels' in annotation
+ assert isinstance(
+ result,
+ (tuple, list)), f'Expected tuple or list, but got {type(result)}'
+
+ gt_masks = annotation.get('gt_masks', None)
+ if gt_masks is not None:
+ gt_masks = mask2ndarray(gt_masks)
+
+ img = imread(img)
+
+ img = imshow_det_bboxes(
+ img,
+ annotation['gt_bboxes'],
+ annotation['gt_labels'],
+ gt_masks,
+ class_names=class_names,
+ bbox_color=gt_bbox_color,
+ text_color=gt_text_color,
+ mask_color=gt_mask_color,
+ thickness=thickness,
+ font_size=font_size,
+ win_name=win_name,
+ show=False)
+
+ if isinstance(result, tuple):
+ bbox_result, segm_result = result
+ if isinstance(segm_result, tuple):
+ segm_result = segm_result[0] # ms rcnn
+ else:
+ bbox_result, segm_result = result, None
+
+ bboxes = np.vstack(bbox_result)
+ labels = [
+ np.full(bbox.shape[0], i, dtype=np.int32)
+ for i, bbox in enumerate(bbox_result)
+ ]
+ labels = np.concatenate(labels)
+
+ segms = None
+ if segm_result is not None and len(labels) > 0: # non empty
+ segms = concat_list(segm_result)
+ segms = mask_util.decode(segms)
+ segms = segms.transpose(2, 0, 1)
+
+ img = imshow_det_bboxes(
+ img,
+ bboxes,
+ labels,
+ segms=segms,
+ class_names=class_names,
+ score_thr=score_thr,
+ bbox_color=det_bbox_color,
+ text_color=det_text_color,
+ mask_color=det_mask_color,
+ thickness=thickness,
+ font_size=font_size,
+ win_name=win_name,
+ show=show,
+ wait_time=wait_time,
+ out_file=out_file)
+ return img
diff --git a/mmcv/core/visualizer/__init__.py b/mmcv/core/visualizer/__init__.py
new file mode 100644
index 0000000..bbf1e60
--- /dev/null
+++ b/mmcv/core/visualizer/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .show_result import (show_multi_modality_result, show_result,
+ show_seg_result)
+
+__all__ = ['show_result', 'show_seg_result', 'show_multi_modality_result']
diff --git a/mmcv/core/visualizer/image_vis.py b/mmcv/core/visualizer/image_vis.py
new file mode 100644
index 0000000..60034f1
--- /dev/null
+++ b/mmcv/core/visualizer/image_vis.py
@@ -0,0 +1,198 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import cv2
+import numpy as np
+import torch
+from matplotlib import pyplot as plt
+
+
+def project_pts_on_img(points,
+ raw_img,
+ lidar2img_rt,
+ max_distance=70,
+ thickness=-1):
+ """Project the 3D point cloud on the 2D image.
+
+ Args:
+ points (numpy.array): 3D points cloud (x, y, z) to visualize.
+ raw_img (numpy.array): The numpy array of image.
+ lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix
+ according to the camera intrinsic parameters.
+ max_distance (float): the max distance of the points cloud.
+ Default: 70.
+ thickness (int, optional): The thickness of 2D points. Default: -1.
+ """
+ img = raw_img.copy()
+ num_points = points.shape[0]
+ pts_4d = np.concatenate([points[:, :3], np.ones((num_points, 1))], axis=-1)
+ pts_2d = pts_4d @ lidar2img_rt.T
+
+ # cam_points is Tensor of Nx4 whose last column is 1
+ # transform camera coordinate to image coordinate
+ pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=99999)
+ pts_2d[:, 0] /= pts_2d[:, 2]
+ pts_2d[:, 1] /= pts_2d[:, 2]
+
+ fov_inds = ((pts_2d[:, 0] < img.shape[1])
+ & (pts_2d[:, 0] >= 0)
+ & (pts_2d[:, 1] < img.shape[0])
+ & (pts_2d[:, 1] >= 0))
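+ # Keep only points that project inside the image; the third column is
+ # the depth, which is used below to pick a color from the HSV colormap.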
+
+ imgfov_pts_2d = pts_2d[fov_inds, :3] # u, v, d
+
+ cmap = plt.cm.get_cmap('hsv', 256)
+ cmap = np.array([cmap(i) for i in range(256)])[:, :3] * 255
+ for i in range(imgfov_pts_2d.shape[0]):
+ depth = imgfov_pts_2d[i, 2]
+ color = cmap[np.clip(int(max_distance * 10 / depth), 0, 255), :]
+ cv2.circle(
+ img,
+ center=(int(np.round(imgfov_pts_2d[i, 0])),
+ int(np.round(imgfov_pts_2d[i, 1]))),
+ radius=1,
+ color=tuple(color),
+ thickness=thickness,
+ )
+ cv2.imshow('project_pts_img', img.astype(np.uint8))
+ cv2.waitKey(100)
+
+
+def plot_rect3d_on_img(img,
+ num_rects,
+ rect_corners,
+ color=(0, 255, 0),
+ thickness=1):
+ """Plot the boundary lines of 3D rectangular on 2D images.
+
+ Args:
+ img (numpy.array): The numpy array of image.
+ num_rects (int): Number of 3D rectangulars.
+ rect_corners (numpy.array): Coordinates of the corners of 3D
+ rectangulars. Should be in the shape of [num_rect, 8, 2].
+ color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
+ thickness (int, optional): The thickness of bboxes. Default: 1.
+ """
+ line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
+ (4, 5), (4, 7), (2, 6), (5, 6), (6, 7))
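+ # Each pair indexes two of the 8 projected corners, giving the 12
+ # edges of the 3D box to draw.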
+ for i in range(num_rects):
+ corners = rect_corners[i].astype(np.int32)
+ for start, end in line_indices:
+ cv2.line(img, (corners[start, 0], corners[start, 1]),
+ (corners[end, 0], corners[end, 1]), color, thickness,
+ cv2.LINE_AA)
+
+ return img.astype(np.uint8)
+
+
+def draw_lidar_bbox3d_on_img(bboxes3d,
+ raw_img,
+ lidar2img_rt,
+ img_metas,
+ color=(0, 255, 0),
+ thickness=1):
+ """Project the 3D bbox on 2D plane and draw on input image.
+
+ Args:
+ bboxes3d (:obj:`LiDARInstance3DBoxes`):
+ 3d bbox in lidar coordinate system to visualize.
+ raw_img (numpy.array): The numpy array of image.
+ lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix
+ according to the camera intrinsic parameters.
+ img_metas (dict): Useless here.
+ color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
+ thickness (int, optional): The thickness of bboxes. Default: 1.
+ """
+ img = raw_img.copy()
+ corners_3d = bboxes3d.corners
+ num_bbox = corners_3d.shape[0]
+ pts_4d = np.concatenate(
+ [corners_3d.reshape(-1, 3),
+ np.ones((num_bbox * 8, 1))], axis=-1)
+ lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4)
+ if isinstance(lidar2img_rt, torch.Tensor):
+ lidar2img_rt = lidar2img_rt.cpu().numpy()
+ pts_2d = pts_4d @ lidar2img_rt.T
+
+ pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
+ pts_2d[:, 0] /= pts_2d[:, 2]
+ pts_2d[:, 1] /= pts_2d[:, 2]
+ imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2)
+
+ return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness)
+
+
+# TODO: remove third parameter in all functions here in favour of img_metas
+def draw_depth_bbox3d_on_img(bboxes3d,
+ raw_img,
+ calibs,
+ img_metas,
+ color=(0, 255, 0),
+ thickness=1):
+ """Project the 3D bbox on 2D plane and draw on input image.
+
+ Args:
+ bboxes3d (:obj:`DepthInstance3DBoxes`, shape=[M, 7]):
+ 3d bbox in depth coordinate system to visualize.
+ raw_img (numpy.array): The numpy array of image.
+ calibs (dict): Camera calibration information, Rt and K.
+ img_metas (dict): Used in coordinates transformation.
+ color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
+ thickness (int, optional): The thickness of bboxes. Default: 1.
+ """
+ from mmcv.core.bbox import points_cam2img
+ from mmcv.models import apply_3d_transformation
+
+ img = raw_img.copy()
+ img_metas = copy.deepcopy(img_metas)
+ corners_3d = bboxes3d.corners
+ num_bbox = corners_3d.shape[0]
+ points_3d = corners_3d.reshape(-1, 3)
+
+ # first reverse the data transformations
+ xyz_depth = apply_3d_transformation(
+ points_3d, 'DEPTH', img_metas, reverse=True)
+
+ # project to 2d to get image coords (uv)
+ uv_origin = points_cam2img(xyz_depth,
+ xyz_depth.new_tensor(img_metas['depth2img']))
+ uv_origin = (uv_origin - 1).round()
+ imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy()
+
+ return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness)
+
+
+def draw_camera_bbox3d_on_img(bboxes3d,
+ raw_img,
+ cam2img,
+ img_metas,
+ color=(0, 255, 0),
+ thickness=1):
+ """Project the 3D bbox on 2D plane and draw on input image.
+
+ Args:
+ bboxes3d (:obj:`CameraInstance3DBoxes`, shape=[M, 7]):
+ 3d bbox in camera coordinate system to visualize.
+ raw_img (numpy.array): The numpy array of image.
+ cam2img (dict): Camera intrinsic matrix,
+ denoted as `K` in depth bbox coordinate system.
+ img_metas (dict): Useless here.
+ color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0).
+ thickness (int, optional): The thickness of bboxes. Default: 1.
+ """
+ from mmcv.core.bbox import points_cam2img
+
+ img = raw_img.copy()
+ cam2img = copy.deepcopy(cam2img)
+ corners_3d = bboxes3d.corners
+ num_bbox = corners_3d.shape[0]
+ points_3d = corners_3d.reshape(-1, 3)
+ if not isinstance(cam2img, torch.Tensor):
+ cam2img = torch.from_numpy(np.array(cam2img))
+ cam2img = cam2img.reshape(3, 3).float().cpu()
+
+ # project to 2d to get image coords (uv)
+ uv_origin = points_cam2img(points_3d, cam2img)
+ uv_origin = (uv_origin - 1).round()
+ imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy()
+
+ return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness)
diff --git a/mmcv/core/visualizer/open3d_vis.py b/mmcv/core/visualizer/open3d_vis.py
new file mode 100644
index 0000000..0790ee4
--- /dev/null
+++ b/mmcv/core/visualizer/open3d_vis.py
@@ -0,0 +1,443 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import numpy as np
+import torch
+
+try:
+ import open3d as o3d
+ from open3d import geometry
+except ImportError:
+ raise ImportError(
+ 'Please run "pip install open3d" to install open3d first.')
+
+
+def _draw_points(points,
+ vis,
+ points_size=2,
+ point_color=(0.5, 0.5, 0.5),
+ mode='xyz'):
+ """Draw points on visualizer.
+
+ Args:
+ points (numpy.array | torch.tensor, shape=[N, 3+C]):
+ points to visualize.
+ vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
+ points_size (int): the size of points to show on visualizer.
+ Default: 2.
+ point_color (tuple[float]): the color of points.
+ Default: (0.5, 0.5, 0.5).
+ mode (str): indicate type of the input points, available modes
+ ['xyz', 'xyzrgb']. Default: 'xyz'.
+
+ Returns:
+ tuple: points, color of each point.
+ """
+ vis.get_render_option().point_size = points_size # set points size
+ if isinstance(points, torch.Tensor):
+ points = points.cpu().numpy()
+
+ points = points.copy()
+ pcd = geometry.PointCloud()
+ if mode == 'xyz':
+ pcd.points = o3d.utility.Vector3dVector(points[:, :3])
+ points_colors = np.tile(np.array(point_color), (points.shape[0], 1))
+ elif mode == 'xyzrgb':
+ pcd.points = o3d.utility.Vector3dVector(points[:, :3])
+ points_colors = points[:, 3:6]
+ # normalize to [0, 1] for open3d drawing
+ if not ((points_colors >= 0.0) & (points_colors <= 1.0)).all():
+ points_colors /= 255.0
+ else:
+ raise NotImplementedError
+
+ pcd.colors = o3d.utility.Vector3dVector(points_colors)
+ vis.add_geometry(pcd)
+
+ return pcd, points_colors
+
+
+def _draw_bboxes(bbox3d,
+ vis,
+ points_colors,
+ pcd=None,
+ bbox_color=(0, 1, 0),
+ points_in_box_color=(1, 0, 0),
+ rot_axis=2,
+ center_mode='lidar_bottom',
+ mode='xyz'):
+ """Draw bbox on visualizer and change the color of points inside bbox3d.
+
+ Args:
+ bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
+ 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
+ vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
+ points_colors (numpy.array): color of each points.
+ pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None.
+ bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
+ points_in_box_color (tuple[float]):
+ the color of points inside bbox3d. Default: (1, 0, 0).
+ rot_axis (int): rotation axis of bbox. Default: 2.
+ center_mode (str): indicate whether the center of bbox is the bottom
+ center or the gravity center. available modes
+ ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
+ mode (str): indicate type of the input points, available modes
+ ['xyz', 'xyzrgb']. Default: 'xyz'.
+ """
+ if isinstance(bbox3d, torch.Tensor):
+ bbox3d = bbox3d.cpu().numpy()
+ bbox3d = bbox3d.copy()
+
+ in_box_color = np.array(points_in_box_color)
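+ # For every box: build an open3d OrientedBoundingBox from the center,
+ # size and (negated) yaw, draw it as a LineSet, and recolor the points
+ # that fall inside it.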
+ for i in range(len(bbox3d)):
+ center = bbox3d[i, 0:3]
+ dim = bbox3d[i, 3:6]
+ yaw = np.zeros(3)
+ yaw[rot_axis] = -bbox3d[i, 6]
+ rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)
+
+ if center_mode == 'lidar_bottom':
+ center[rot_axis] += dim[
+ rot_axis] / 2 # bottom center to gravity center
+ elif center_mode == 'camera_bottom':
+ center[rot_axis] -= dim[
+ rot_axis] / 2 # bottom center to gravity center
+ box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)
+
+ line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d)
+ line_set.paint_uniform_color(bbox_color)
+ # draw bboxes on visualizer
+ vis.add_geometry(line_set)
+
+ # change the color of points which are in box
+ if pcd is not None and mode == 'xyz':
+ indices = box3d.get_point_indices_within_bounding_box(pcd.points)
+ points_colors[indices] = in_box_color
+
+ # update points colors
+ if pcd is not None:
+ pcd.colors = o3d.utility.Vector3dVector(points_colors)
+ vis.update_geometry(pcd)
+
+
+def show_pts_boxes(points,
+ bbox3d=None,
+ show=True,
+ save_path=None,
+ points_size=2,
+ point_color=(0.5, 0.5, 0.5),
+ bbox_color=(0, 1, 0),
+ points_in_box_color=(1, 0, 0),
+ rot_axis=2,
+ center_mode='lidar_bottom',
+ mode='xyz'):
+ """Draw bbox and points on visualizer.
+
+ Args:
+ points (numpy.array | torch.tensor, shape=[N, 3+C]):
+ points to visualize.
+ bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
+ 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None.
+ show (bool): whether to show the visualization results. Default: True.
+ save_path (str): path to save visualized results. Default: None.
+ points_size (int): the size of points to show on visualizer.
+ Default: 2.
+ point_color (tuple[float]): the color of points.
+ Default: (0.5, 0.5, 0.5).
+ bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
+ points_in_box_color (tuple[float]):
+ the color of points which are in bbox3d. Default: (1, 0, 0).
+ rot_axis (int): rotation axis of bbox. Default: 2.
+ center_mode (str): indicate whether the center of bbox is the bottom
+ center or the gravity center. available modes
+ ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
+ mode (str): indicate type of the input points, available modes
+ ['xyz', 'xyzrgb']. Default: 'xyz'.
+ """
+ # TODO: support score and class info
+ assert 0 <= rot_axis <= 2
+
+ # init visualizer
+ vis = o3d.visualization.Visualizer()
+ vis.create_window()
+ mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
+ size=1, origin=[0, 0, 0]) # create coordinate frame
+ vis.add_geometry(mesh_frame)
+
+ # draw points
+ pcd, points_colors = _draw_points(points, vis, points_size, point_color,
+ mode)
+
+ # draw boxes
+ if bbox3d is not None:
+ _draw_bboxes(bbox3d, vis, points_colors, pcd, bbox_color,
+ points_in_box_color, rot_axis, center_mode, mode)
+
+ if show:
+ vis.run()
+
+ if save_path is not None:
+ vis.capture_screen_image(save_path)
+
+ vis.destroy_window()
+
+
+def _draw_bboxes_ind(bbox3d,
+ vis,
+ indices,
+ points_colors,
+ pcd=None,
+ bbox_color=(0, 1, 0),
+ points_in_box_color=(1, 0, 0),
+ rot_axis=2,
+ center_mode='lidar_bottom',
+ mode='xyz'):
+ """Draw bbox on visualizer and change the color or points inside bbox3d
+ with indices.
+
+ Args:
+ bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
+ 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
+ vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
+ indices (numpy.array | torch.tensor, shape=[N, M]):
+ indicate which bbox3d that each point lies in.
+ points_colors (numpy.array): color of each points.
+ pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None.
+ bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
+ points_in_box_color (tuple[float]):
+ the color of points which are in bbox3d. Default: (1, 0, 0).
+ rot_axis (int): rotation axis of bbox. Default: 2.
+ center_mode (str): indicate whether the center of bbox is the bottom
+ center or the gravity center. available modes
+ ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
+ mode (str): indicate type of the input points, available modes
+ ['xyz', 'xyzrgb']. Default: 'xyz'.
+ """
+ if isinstance(bbox3d, torch.Tensor):
+ bbox3d = bbox3d.cpu().numpy()
+ if isinstance(indices, torch.Tensor):
+ indices = indices.cpu().numpy()
+ bbox3d = bbox3d.copy()
+
+ in_box_color = np.array(points_in_box_color)
+ for i in range(len(bbox3d)):
+ center = bbox3d[i, 0:3]
+ dim = bbox3d[i, 3:6]
+ yaw = np.zeros(3)
+ # TODO: fix problem of current coordinate system
+ # dim[0], dim[1] = dim[1], dim[0] # for current coordinate
+ # yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi)
+ yaw[rot_axis] = -bbox3d[i, 6]
+ rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)
+ if center_mode == 'lidar_bottom':
+ center[rot_axis] += dim[
+ rot_axis] / 2 # bottom center to gravity center
+ elif center_mode == 'camera_bottom':
+ center[rot_axis] -= dim[
+ rot_axis] / 2 # bottom center to gravity center
+ box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)
+
+ line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d)
+ line_set.paint_uniform_color(bbox_color)
+ # draw bboxes on visualizer
+ vis.add_geometry(line_set)
+
+ # change the color of points which are in box
+ if pcd is not None and mode == 'xyz':
+ points_colors[indices[:, i].astype(bool)] = in_box_color
+
+ # update points colors
+ if pcd is not None:
+ pcd.colors = o3d.utility.Vector3dVector(points_colors)
+ vis.update_geometry(pcd)
+
+
+def show_pts_index_boxes(points,
+ bbox3d=None,
+ show=True,
+ indices=None,
+ save_path=None,
+ points_size=2,
+ point_color=(0.5, 0.5, 0.5),
+ bbox_color=(0, 1, 0),
+ points_in_box_color=(1, 0, 0),
+ rot_axis=2,
+ center_mode='lidar_bottom',
+ mode='xyz'):
+ """Draw bbox and points on visualizer with indices that indicate which
+ bbox3d that each point lies in.
+
+ Args:
+ points (numpy.array | torch.tensor, shape=[N, 3+C]):
+ points to visualize.
+ bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
+ 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None.
+ show (bool): whether to show the visualization results. Default: True.
+ indices (numpy.array | torch.tensor, shape=[N, M]):
+ indicate which bbox3d that each point lies in. Default: None.
+ save_path (str): path to save visualized results. Default: None.
+ points_size (int): the size of points to show on visualizer.
+ Default: 2.
+ point_color (tuple[float]): the color of points.
+ Default: (0.5, 0.5, 0.5).
+ bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
+ points_in_box_color (tuple[float]):
+ the color of points which are in bbox3d. Default: (1, 0, 0).
+ rot_axis (int): rotation axis of bbox. Default: 2.
+ center_mode (str): indicate whether the center of bbox is the bottom
+ center or the gravity center. available modes
+ ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
+ mode (str): indicate type of the input points, available modes
+ ['xyz', 'xyzrgb']. Default: 'xyz'.
+ """
+ # TODO: support score and class info
+ assert 0 <= rot_axis <= 2
+
+ # init visualizer
+ vis = o3d.visualization.Visualizer()
+ vis.create_window()
+ mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
+ size=1, origin=[0, 0, 0]) # create coordinate frame
+ vis.add_geometry(mesh_frame)
+
+ # draw points
+ pcd, points_colors = _draw_points(points, vis, points_size, point_color,
+ mode)
+
+ # draw boxes
+ if bbox3d is not None:
+ _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color,
+ points_in_box_color, rot_axis, center_mode, mode)
+
+ if show:
+ vis.run()
+
+ if save_path is not None:
+ vis.capture_screen_image(save_path)
+
+ vis.destroy_window()
+
+
+class Visualizer(object):
+ r"""Online visualizer implemented with Open3d.
+
+ Args:
+ points (numpy.array, shape=[N, 3+C]): Points to visualize. The Points
+ cloud is in mode of Coord3DMode.DEPTH (please refer to
+ core.structures.coord_3d_mode).
+ bbox3d (numpy.array, shape=[M, 7]): 3d bbox (x, y, z, dx, dy, dz, yaw)
+ to visualize. The 3d bbox is in mode of Box3DMode.DEPTH with
+ gravity_center (please refer to core.structures.box_3d_mode).
+ Default: None.
+ save_path (str): path to save visualized results. Default: None.
+ points_size (int): the size of points to show on visualizer.
+ Default: 2.
+ point_color (tuple[float]): the color of points.
+ Default: (0.5, 0.5, 0.5).
+ bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
+ points_in_box_color (tuple[float]):
+ the color of points which are in bbox3d. Default: (1, 0, 0).
+ rot_axis (int): rotation axis of bbox. Default: 2.
+        center_mode (str): indicates whether the bbox center is the bottom
+            center or the gravity center. Available modes:
+            ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
+        mode (str): indicates the type of the input points. Available modes:
+            ['xyz', 'xyzrgb']. Default: 'xyz'.
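+
+    Example:
+        A minimal usage sketch with random inputs (requires a display; the
+        numbers are placeholders):
+
+        >>> import numpy as np
+        >>> points = np.random.rand(1000, 3) * 20
+        >>> boxes = np.array([[10.0, 10.0, 0.0, 4.0, 2.0, 1.6, 0.0]])
+        >>> vis = Visualizer(points)
+        >>> vis.add_bboxes(bbox3d=boxes)
+        >>> vis.show('demo_open3d.png')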
+ """
+
+ def __init__(self,
+ points,
+ bbox3d=None,
+ save_path=None,
+ points_size=2,
+ point_color=(0.5, 0.5, 0.5),
+ bbox_color=(0, 1, 0),
+ points_in_box_color=(1, 0, 0),
+ rot_axis=2,
+ center_mode='lidar_bottom',
+ mode='xyz'):
+ super(Visualizer, self).__init__()
+ assert 0 <= rot_axis <= 2
+
+ # init visualizer
+ self.o3d_visualizer = o3d.visualization.Visualizer()
+ self.o3d_visualizer.create_window()
+ mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
+ size=1, origin=[0, 0, 0]) # create coordinate frame
+ self.o3d_visualizer.add_geometry(mesh_frame)
+
+ self.points_size = points_size
+ self.point_color = point_color
+ self.bbox_color = bbox_color
+ self.points_in_box_color = points_in_box_color
+ self.rot_axis = rot_axis
+ self.center_mode = center_mode
+ self.mode = mode
+ self.seg_num = 0
+
+ # draw points
+ if points is not None:
+ self.pcd, self.points_colors = _draw_points(
+ points, self.o3d_visualizer, points_size, point_color, mode)
+
+ # draw boxes
+ if bbox3d is not None:
+ _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors,
+ self.pcd, bbox_color, points_in_box_color, rot_axis,
+ center_mode, mode)
+
+ def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None):
+ """Add bounding box to visualizer.
+
+ Args:
+ bbox3d (numpy.array, shape=[M, 7]):
+ 3D bbox (x, y, z, dx, dy, dz, yaw) to be visualized.
+ The 3d bbox is in mode of Box3DMode.DEPTH with
+ gravity_center (please refer to core.structures.box_3d_mode).
+            bbox_color (tuple[float]): the color of bbox. Default: None.
+            points_in_box_color (tuple[float]): the color of points which
+                are in bbox3d. Default: None.
+ """
+ if bbox_color is None:
+ bbox_color = self.bbox_color
+ if points_in_box_color is None:
+ points_in_box_color = self.points_in_box_color
+ _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd,
+ bbox_color, points_in_box_color, self.rot_axis,
+ self.center_mode, self.mode)
+
+ def add_seg_mask(self, seg_mask_colors):
+ """Add segmentation mask to visualizer via per-point colorization.
+
+ Args:
+ seg_mask_colors (numpy.array, shape=[N, 6]):
+ The segmentation mask whose first 3 dims are point coordinates
+ and last 3 dims are converted colors.
+ """
+ # we can't draw the colors on existing points
+ # in case gt and pred mask would overlap
+ # instead we set a large offset along x-axis for each seg mask
+ self.seg_num += 1
+ offset = (np.array(self.pcd.points).max(0) -
+ np.array(self.pcd.points).min(0))[0] * 1.2 * self.seg_num
+ mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
+ size=1, origin=[offset, 0, 0]) # create coordinate frame for seg
+ self.o3d_visualizer.add_geometry(mesh_frame)
+ seg_points = copy.deepcopy(seg_mask_colors)
+ seg_points[:, 0] += offset
+ _draw_points(
+ seg_points, self.o3d_visualizer, self.points_size, mode='xyzrgb')
+
+ def show(self, save_path=None):
+ """Visualize the points cloud.
+
+ Args:
+ save_path (str): path to save image. Default: None.
+ """
+
+ self.o3d_visualizer.run()
+
+ if save_path is not None:
+ self.o3d_visualizer.capture_screen_image(save_path)
+
+ self.o3d_visualizer.destroy_window()
+ return
diff --git a/mmcv/core/visualizer/show_result.py b/mmcv/core/visualizer/show_result.py
new file mode 100644
index 0000000..eb50be1
--- /dev/null
+++ b/mmcv/core/visualizer/show_result.py
@@ -0,0 +1,272 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import trimesh
+from os import path as osp
+
+from mmcv.utils import mkdir_or_exist
+from mmcv.image import imwrite
+
+from .image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img,
+ draw_lidar_bbox3d_on_img)
+
+
+def _write_obj(points, out_filename):
+ """Write points into ``obj`` format for meshlab visualization.
+
+ Args:
+ points (np.ndarray): Points in shape (N, dim).
+ out_filename (str): Filename to be saved.
+ """
+ N = points.shape[0]
+ fout = open(out_filename, 'w')
+ for i in range(N):
+ if points.shape[1] == 6:
+ c = points[i, 3:].astype(int)
+ fout.write(
+ 'v %f %f %f %d %d %d\n' %
+ (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2]))
+
+ else:
+ fout.write('v %f %f %f\n' %
+ (points[i, 0], points[i, 1], points[i, 2]))
+ fout.close()
+
+
+def _write_oriented_bbox(scene_bbox, out_filename):
+ """Export oriented (around Z axis) scene bbox to meshes.
+
+ Args:
+        scene_bbox (list[ndarray] or ndarray): xyz pos of center and
+            3 lengths (dx, dy, dz) and heading angle around Z axis.
+            Y forward, X right, Z upward. Heading angle of positive X is 0,
+            heading angle of positive Y is 90 degrees.
+        out_filename (str): Filename.
+ """
+
+ def heading2rotmat(heading_angle):
+ rotmat = np.zeros((3, 3))
+ rotmat[2, 2] = 1
+ cosval = np.cos(heading_angle)
+ sinval = np.sin(heading_angle)
+ rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]])
+ return rotmat
+
+ def convert_oriented_box_to_trimesh_fmt(box):
+ ctr = box[:3]
+ lengths = box[3:6]
+ trns = np.eye(4)
+ trns[0:3, 3] = ctr
+ trns[3, 3] = 1.0
+ trns[0:3, 0:3] = heading2rotmat(box[6])
+ box_trimesh_fmt = trimesh.creation.box(lengths, trns)
+ return box_trimesh_fmt
+
+ if len(scene_bbox) == 0:
+ scene_bbox = np.zeros((1, 7))
+ scene = trimesh.scene.Scene()
+ for box in scene_bbox:
+ scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box))
+
+ mesh_list = trimesh.util.concatenate(scene.dump())
+ # save to obj file
+ trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='obj')
+
+ return
+
+
+def show_result(points,
+ gt_bboxes,
+ pred_bboxes,
+ out_dir,
+ filename,
+ show=True,
+ snapshot=False):
+ """Convert results into format that is directly readable for meshlab.
+
+ Args:
+ points (np.ndarray): Points.
+ gt_bboxes (np.ndarray): Ground truth boxes.
+ pred_bboxes (np.ndarray): Predicted boxes.
+ out_dir (str): Path of output directory
+ filename (str): Filename of the current frame.
+        show (bool): Visualize the results online. Defaults to True.
+ snapshot (bool): Whether to save the online results. Defaults to False.
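+
+    Example:
+        A minimal sketch with random data in offline mode (``show=False``);
+        the paths and box values are placeholders:
+
+        >>> import numpy as np
+        >>> points = np.random.rand(1000, 3)
+        >>> pred = np.array([[0.5, 0.5, 0.5, 2.0, 1.0, 1.0, 0.0]])
+        >>> show_result(points, None, pred, './viz', 'demo', show=False)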
+ """
+ result_path = osp.join(out_dir, filename)
+ mkdir_or_exist(result_path)
+
+ if show:
+ from .open3d_vis import Visualizer
+
+ vis = Visualizer(points)
+ if pred_bboxes is not None:
+ vis.add_bboxes(bbox3d=pred_bboxes)
+ if gt_bboxes is not None:
+ vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1))
+ show_path = osp.join(result_path,
+ f'{filename}_online.png') if snapshot else None
+ vis.show(show_path)
+
+ if points is not None:
+ _write_obj(points, osp.join(result_path, f'{filename}_points.obj'))
+
+ if gt_bboxes is not None:
+ # bottom center to gravity center
+ gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2
+ # the positive direction for yaw in meshlab is clockwise
+ gt_bboxes[:, 6] *= -1
+ _write_oriented_bbox(gt_bboxes,
+ osp.join(result_path, f'{filename}_gt.obj'))
+
+ if pred_bboxes is not None:
+ # bottom center to gravity center
+ pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2
+ # the positive direction for yaw in meshlab is clockwise
+ pred_bboxes[:, 6] *= -1
+ _write_oriented_bbox(pred_bboxes,
+ osp.join(result_path, f'{filename}_pred.obj'))
+
+
+def show_seg_result(points,
+ gt_seg,
+ pred_seg,
+ out_dir,
+ filename,
+ palette,
+ ignore_index=None,
+ show=True,
+ snapshot=False):
+ """Convert results into format that is directly readable for meshlab.
+
+ Args:
+ points (np.ndarray): Points.
+ gt_seg (np.ndarray): Ground truth segmentation mask.
+ pred_seg (np.ndarray): Predicted segmentation mask.
+ out_dir (str): Path of output directory
+ filename (str): Filename of the current frame.
+ palette (np.ndarray): Mapping between class labels and colors.
+ ignore_index (int, optional): The label index to be ignored, e.g. \
+ unannotated points. Defaults to None.
+        show (bool, optional): Visualize the results online. Defaults to True.
+ snapshot (bool, optional): Whether to save the online results. \
+ Defaults to False.
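+
+    Example:
+        A minimal sketch with random data in offline mode; the palette and
+        label count are placeholders:
+
+        >>> import numpy as np
+        >>> points = np.random.rand(1000, 3)
+        >>> pred_seg = np.random.randint(0, 3, 1000)
+        >>> palette = np.array([[255, 0, 0], [0, 255, 0], [0, 0, 255]])
+        >>> show_seg_result(points, None, pred_seg, './viz', 'demo',
+        ...                 palette, show=False)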
+ """
+ # we need 3D coordinates to visualize segmentation mask
+ if gt_seg is not None or pred_seg is not None:
+ assert points is not None, \
+ '3D coordinates are required for segmentation visualization'
+
+ # filter out ignored points
+ if gt_seg is not None and ignore_index is not None:
+ if points is not None:
+ points = points[gt_seg != ignore_index]
+ if pred_seg is not None:
+ pred_seg = pred_seg[gt_seg != ignore_index]
+ gt_seg = gt_seg[gt_seg != ignore_index]
+
+ if gt_seg is not None:
+ gt_seg_color = palette[gt_seg]
+ gt_seg_color = np.concatenate([points[:, :3], gt_seg_color], axis=1)
+ if pred_seg is not None:
+ pred_seg_color = palette[pred_seg]
+ pred_seg_color = np.concatenate([points[:, :3], pred_seg_color],
+ axis=1)
+
+ result_path = osp.join(out_dir, filename)
+ mkdir_or_exist(result_path)
+
+ # online visualization of segmentation mask
+ # we show three masks in a row, scene_points, gt_mask, pred_mask
+ if show:
+ from .open3d_vis import Visualizer
+ mode = 'xyzrgb' if points.shape[1] == 6 else 'xyz'
+ vis = Visualizer(points, mode=mode)
+ if gt_seg is not None:
+ vis.add_seg_mask(gt_seg_color)
+ if pred_seg is not None:
+ vis.add_seg_mask(pred_seg_color)
+ show_path = osp.join(result_path,
+ f'{filename}_online.png') if snapshot else None
+ vis.show(show_path)
+
+ if points is not None:
+ _write_obj(points, osp.join(result_path, f'{filename}_points.obj'))
+
+ if gt_seg is not None:
+ _write_obj(gt_seg_color, osp.join(result_path, f'{filename}_gt.obj'))
+
+ if pred_seg is not None:
+ _write_obj(pred_seg_color, osp.join(result_path,
+ f'{filename}_pred.obj'))
+
+
+def show_multi_modality_result(img,
+ gt_bboxes,
+ pred_bboxes,
+ proj_mat,
+ out_dir,
+ filename,
+ box_mode='lidar',
+ img_metas=None,
+ show=True,
+ gt_bbox_color=(61, 102, 255),
+ pred_bbox_color=(241, 101, 72)):
+ """Convert multi-modality detection results into 2D results.
+
+ Project the predicted 3D bbox to 2D image plane and visualize them.
+
+ Args:
+ img (np.ndarray): The numpy array of image in cv2 fashion.
+ gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes.
+ pred_bboxes (:obj:`BaseInstance3DBoxes`): Predicted boxes.
+ proj_mat (numpy.array, shape=[4, 4]): The projection matrix
+ according to the camera intrinsic parameters.
+ out_dir (str): Path of output directory.
+ filename (str): Filename of the current frame.
+ box_mode (str): Coordinate system the boxes are in. Should be one of
+ 'depth', 'lidar' and 'camera'. Defaults to 'lidar'.
+ img_metas (dict): Used in projecting depth bbox.
+        show (bool): Visualize the results online. Defaults to True.
+        gt_bbox_color (str or tuple(int)): Color of GT bbox lines.
+            The tuple of color should be in BGR order. Default: (61, 102, 255)
+        pred_bbox_color (str or tuple(int)): Color of predicted bbox lines.
+            The tuple of color should be in BGR order. Default: (241, 101, 72)
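+
+    Example:
+        A minimal sketch in offline mode. The identity projection matrix is a
+        placeholder only; a real ``lidar2img`` matrix comes from the camera
+        calibration:
+
+        >>> import numpy as np
+        >>> from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+        >>> img = np.zeros((900, 1600, 3), dtype=np.uint8)
+        >>> pred = LiDARInstance3DBoxes(
+        ...     np.array([[10.0, 0.0, -1.0, 4.0, 2.0, 1.6, 0.0]]))
+        >>> show_multi_modality_result(img, None, pred, np.eye(4), './viz',
+        ...                            'demo', box_mode='lidar', show=False)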
+ """
+ if box_mode == 'depth':
+ draw_bbox = draw_depth_bbox3d_on_img
+ elif box_mode == 'lidar':
+ draw_bbox = draw_lidar_bbox3d_on_img
+ elif box_mode == 'camera':
+ draw_bbox = draw_camera_bbox3d_on_img
+ else:
+ raise NotImplementedError(f'unsupported box mode {box_mode}')
+
+ result_path = osp.join(out_dir, filename)
+ mkdir_or_exist(result_path)
+
+ if show:
+ show_img = img.copy()
+ if gt_bboxes is not None:
+ show_img = draw_bbox(
+ gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color)
+ if pred_bboxes is not None:
+ show_img = draw_bbox(
+ pred_bboxes,
+ show_img,
+ proj_mat,
+ img_metas,
+ color=pred_bbox_color)
+ mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0)
+
+ if img is not None:
+ imwrite(img, osp.join(result_path, f'{filename}_img.png'))
+
+ if gt_bboxes is not None:
+ gt_img = draw_bbox(
+ gt_bboxes, img, proj_mat, img_metas, color=gt_bbox_color)
+ imwrite(gt_img, osp.join(result_path, f'{filename}_gt.png'))
+
+ if pred_bboxes is not None:
+ pred_img = draw_bbox(
+ pred_bboxes, img, proj_mat, img_metas, color=pred_bbox_color)
+ imwrite(pred_img, osp.join(result_path, f'{filename}_pred.png'))
diff --git a/mmcv/core/voxel/__init__.py b/mmcv/core/voxel/__init__.py
new file mode 100644
index 0000000..8d69543
--- /dev/null
+++ b/mmcv/core/voxel/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import build_voxel_generator
+from .voxel_generator import VoxelGenerator
+
+__all__ = ['build_voxel_generator', 'VoxelGenerator']
diff --git a/mmcv/core/voxel/builder.py b/mmcv/core/voxel/builder.py
new file mode 100644
index 0000000..d7fe494
--- /dev/null
+++ b/mmcv/core/voxel/builder.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from . import voxel_generator
+from mmcv.utils import obj_from_dict
+
+def build_voxel_generator(cfg, **kwargs):
+ """Builder of voxel generator."""
+ if isinstance(cfg, voxel_generator.VoxelGenerator):
+ return cfg
+ elif isinstance(cfg, dict):
+ return obj_from_dict(
+ cfg, voxel_generator, default_args=kwargs)
+ else:
+        raise TypeError('Invalid type {} for building a voxel generator'.format(
+            type(cfg)))
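+
+
+# A minimal usage sketch (the numbers below are placeholder settings rather
+# than values taken from any config in this repo):
+#
+#     voxel_generator = build_voxel_generator(dict(
+#         type='VoxelGenerator',
+#         voxel_size=[0.2, 0.2, 8.0],
+#         point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+#         max_num_points=10))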
diff --git a/mmcv/core/voxel/voxel_generator.py b/mmcv/core/voxel/voxel_generator.py
new file mode 100644
index 0000000..615b749
--- /dev/null
+++ b/mmcv/core/voxel/voxel_generator.py
@@ -0,0 +1,280 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numba
+import numpy as np
+
+
+class VoxelGenerator(object):
+ """Voxel generator in numpy implementation.
+
+ Args:
+ voxel_size (list[float]): Size of a single voxel
+ point_cloud_range (list[float]): Range of points
+ max_num_points (int): Maximum number of points in a single voxel
+ max_voxels (int, optional): Maximum number of voxels.
+ Defaults to 20000.
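+
+    Example:
+        An illustrative sketch with random points; the sizes and ranges below
+        are placeholders rather than tuned settings:
+
+        >>> import numpy as np
+        >>> points = np.random.rand(1000, 4).astype(np.float32)
+        >>> points[:, 0] *= 70.0                     # x in [0, 70)
+        >>> points[:, 1] = points[:, 1] * 80 - 40    # y in [-40, 40)
+        >>> points[:, 2] = points[:, 2] * 4 - 3      # z in [-3, 1)
+        >>> generator = VoxelGenerator(
+        ...     voxel_size=[0.2, 0.2, 4.0],
+        ...     point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+        ...     max_num_points=32)
+        >>> voxels, coors, num_points = generator.generate(points)
+        >>> voxels.shape[1:]   # (max_num_points, point feature dim)
+        (32, 4)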
+ """
+
+ def __init__(self,
+ voxel_size,
+ point_cloud_range,
+ max_num_points,
+ max_voxels=20000):
+
+ point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
+ # [0, -40, -3, 70.4, 40, 1]
+ voxel_size = np.array(voxel_size, dtype=np.float32)
+ grid_size = (point_cloud_range[3:] -
+ point_cloud_range[:3]) / voxel_size
+ grid_size = np.round(grid_size).astype(np.int64)
+
+ self._voxel_size = voxel_size
+ self._point_cloud_range = point_cloud_range
+ self._max_num_points = max_num_points
+ self._max_voxels = max_voxels
+ self._grid_size = grid_size
+
+ def generate(self, points):
+ """Generate voxels given points."""
+ return points_to_voxel(points, self._voxel_size,
+ self._point_cloud_range, self._max_num_points,
+ True, self._max_voxels)
+
+ @property
+ def voxel_size(self):
+ """list[float]: Size of a single voxel."""
+ return self._voxel_size
+
+ @property
+ def max_num_points_per_voxel(self):
+ """int: Maximum number of points per voxel."""
+ return self._max_num_points
+
+ @property
+ def point_cloud_range(self):
+ """list[float]: Range of point cloud."""
+ return self._point_cloud_range
+
+ @property
+ def grid_size(self):
+ """np.ndarray: The size of grids."""
+ return self._grid_size
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ indent = ' ' * (len(repr_str) + 1)
+ repr_str += f'(voxel_size={self._voxel_size},\n'
+ repr_str += indent + 'point_cloud_range='
+ repr_str += f'{self._point_cloud_range.tolist()},\n'
+ repr_str += indent + f'max_num_points={self._max_num_points},\n'
+ repr_str += indent + f'max_voxels={self._max_voxels},\n'
+ repr_str += indent + f'grid_size={self._grid_size.tolist()}'
+ repr_str += ')'
+ return repr_str
+
+
+def points_to_voxel(points,
+ voxel_size,
+ coors_range,
+ max_points=35,
+ reverse_index=True,
+ max_voxels=20000):
+ """convert kitti points(N, >=3) to voxels.
+
+ Args:
+ points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \
+ points[:, 3:] contain other information such as reflectivity.
+ voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size
+ coors_range (list[float | tuple[float] | ndarray]): Voxel range. \
+ format: xyzxyz, minmax
+ max_points (int): Indicate maximum points contained in a voxel.
+ reverse_index (bool): Whether return reversed coordinates. \
+ if points has xyz format and reverse_index is True, output \
+ coordinates will be zyx format, but points in features always \
+ xyz format.
+ max_voxels (int): Maximum number of voxels this function creates. \
+            For SECOND, 20000 is a good choice. Points should be shuffled for \
+ randomness before this function because max_voxels drops points.
+
+ Returns:
+ tuple[np.ndarray]:
+ voxels: [M, max_points, ndim] float tensor. only contain points.
+ coordinates: [M, 3] int32 tensor.
+ num_points_per_voxel: [M] int32 tensor.
+ """
+ if not isinstance(voxel_size, np.ndarray):
+ voxel_size = np.array(voxel_size, dtype=points.dtype)
+ if not isinstance(coors_range, np.ndarray):
+ coors_range = np.array(coors_range, dtype=points.dtype)
+ voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
+ voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
+ if reverse_index:
+ voxelmap_shape = voxelmap_shape[::-1]
+ # don't create large array in jit(nopython=True) code.
+ num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
+ coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32)
+ voxels = np.zeros(
+ shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
+ coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
+ if reverse_index:
+ voxel_num = _points_to_voxel_reverse_kernel(
+ points, voxel_size, coors_range, num_points_per_voxel,
+ coor_to_voxelidx, voxels, coors, max_points, max_voxels)
+
+ else:
+ voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range,
+ num_points_per_voxel,
+ coor_to_voxelidx, voxels, coors,
+ max_points, max_voxels)
+
+ coors = coors[:voxel_num]
+ voxels = voxels[:voxel_num]
+ num_points_per_voxel = num_points_per_voxel[:voxel_num]
+
+ return voxels, coors, num_points_per_voxel
+
+
+@numba.jit(nopython=True)
+def _points_to_voxel_reverse_kernel(points,
+ voxel_size,
+ coors_range,
+ num_points_per_voxel,
+ coor_to_voxelidx,
+ voxels,
+ coors,
+ max_points=35,
+ max_voxels=20000):
+ """convert kitti points(N, >=3) to voxels.
+
+ Args:
+ points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \
+ points[:, 3:] contain other information such as reflectivity.
+ voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size \
+ coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \
+ format: xyzxyz, minmax
+ num_points_per_voxel (int): Number of points per voxel.
+ coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), \
+ which has the same shape as the complete voxel map. It indicates \
+ the index of each corresponding voxel.
+ voxels (np.ndarray): Created empty voxels.
+ coors (np.ndarray): Created coordinates of each voxel.
+ max_points (int): Indicate maximum points contained in a voxel.
+        max_voxels (int): Maximum number of voxels this function creates. \
+            For SECOND, 20000 is a good choice. Points should be shuffled for \
+ randomness before this function because max_voxels drops points.
+
+ Returns:
+ tuple[np.ndarray]:
+ voxels: Shape [M, max_points, ndim], only contain points.
+ coordinates: Shape [M, 3].
+ num_points_per_voxel: Shape [M].
+ """
+ # put all computations to one loop.
+ # we shouldn't create large array in main jit code, otherwise
+ # reduce performance
+ N = points.shape[0]
+ # ndim = points.shape[1] - 1
+ ndim = 3
+ ndim_minus_1 = ndim - 1
+ grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
+ # np.round(grid_size)
+ # grid_size = np.round(grid_size).astype(np.int64)(np.int32)
+ grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)
+ coor = np.zeros(shape=(3, ), dtype=np.int32)
+ voxel_num = 0
+ failed = False
+ for i in range(N):
+ failed = False
+ for j in range(ndim):
+ c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
+ if c < 0 or c >= grid_size[j]:
+ failed = True
+ break
+ coor[ndim_minus_1 - j] = c
+ if failed:
+ continue
+ voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
+ if voxelidx == -1:
+ voxelidx = voxel_num
+ if voxel_num >= max_voxels:
+ continue
+ voxel_num += 1
+ coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
+ coors[voxelidx] = coor
+ num = num_points_per_voxel[voxelidx]
+ if num < max_points:
+ voxels[voxelidx, num] = points[i]
+ num_points_per_voxel[voxelidx] += 1
+ return voxel_num
+
+
+@numba.jit(nopython=True)
+def _points_to_voxel_kernel(points,
+ voxel_size,
+ coors_range,
+ num_points_per_voxel,
+ coor_to_voxelidx,
+ voxels,
+ coors,
+ max_points=35,
+ max_voxels=20000):
+ """convert kitti points(N, >=3) to voxels.
+
+ Args:
+ points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \
+ points[:, 3:] contain other information such as reflectivity.
+ voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size.
+ coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \
+ format: xyzxyz, minmax
+ num_points_per_voxel (int): Number of points per voxel.
+ coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), \
+ which has the same shape as the complete voxel map. It indicates \
+ the index of each corresponding voxel.
+ voxels (np.ndarray): Created empty voxels.
+ coors (np.ndarray): Created coordinates of each voxel.
+ max_points (int): Indicate maximum points contained in a voxel.
+        max_voxels (int): Maximum number of voxels this function creates. \
+            For SECOND, 20000 is a good choice. Points should be shuffled for \
+ randomness before this function because max_voxels drops points.
+
+ Returns:
+ tuple[np.ndarray]:
+ voxels: Shape [M, max_points, ndim], only contain points.
+ coordinates: Shape [M, 3].
+ num_points_per_voxel: Shape [M].
+ """
+ N = points.shape[0]
+ # ndim = points.shape[1] - 1
+ ndim = 3
+ grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
+ # grid_size = np.round(grid_size).astype(np.int64)(np.int32)
+ grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)
+
+ # lower_bound = coors_range[:3]
+ # upper_bound = coors_range[3:]
+ coor = np.zeros(shape=(3, ), dtype=np.int32)
+ voxel_num = 0
+ failed = False
+ for i in range(N):
+ failed = False
+ for j in range(ndim):
+ c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
+ if c < 0 or c >= grid_size[j]:
+ failed = True
+ break
+ coor[j] = c
+ if failed:
+ continue
+ voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
+ if voxelidx == -1:
+ voxelidx = voxel_num
+ if voxel_num >= max_voxels:
+ continue
+ voxel_num += 1
+ coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
+ coors[voxelidx] = coor
+ num = num_points_per_voxel[voxelidx]
+ if num < max_points:
+ voxels[voxelidx, num] = points[i]
+ num_points_per_voxel[voxelidx] += 1
+ return voxel_num
diff --git a/mmcv/datasets/B2D_dataset.py b/mmcv/datasets/B2D_dataset.py
new file mode 100644
index 0000000..530c8eb
--- /dev/null
+++ b/mmcv/datasets/B2D_dataset.py
@@ -0,0 +1,504 @@
+import copy
+import numpy as np
+from mmcv.datasets import DATASETS
+from os import path as osp
+import torch
+from pyquaternion import Quaternion
+from mmcv.utils import save_tensor
+from mmcv.parallel import DataContainer as DC
+import random
+from .custom_3d import Custom3DDataset
+from .pipelines import Compose
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from mmcv.fileio.io import load, dump
+from mmcv.utils import track_iter_progress, mkdir_or_exist
+import tempfile
+from .nuscenes_styled_eval_utils import DetectionMetrics, EvalBoxes, DetectionBox,center_distance,accumulate,DetectionMetricDataList,calc_ap, calc_tp, quaternion_yaw
+import json
+
+@DATASETS.register_module()
+class B2D_Dataset(Custom3DDataset):
+
+
+    def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, with_velocity=True, sample_interval=5, name_mapping=None, eval_cfg=None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.queue_length = queue_length
+ self.overlap_test = overlap_test
+ self.with_velocity = with_velocity
+ if name_mapping is not None:
+ self.NameMapping = name_mapping
+ else:
+ self.NameMapping = {
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ "traffic.speed_limit.30": 'speed_limit',
+ "traffic.speed_limit.40": 'speed_limit',
+ "traffic.speed_limit.50": 'speed_limit',
+ "traffic.speed_limit.60": 'speed_limit',
+ "traffic.traffic_light": 'traffic_light',
+ "traffic.stop": 'stop',
+ }
+ if eval_cfg is not None:
+ self.eval_cfg = eval_cfg
+ else:
+ self.eval_cfg = {
+ "dist_ths": [0.5, 1.0, 2.0, 4.0],
+ "dist_th_tp": 2.0,
+ "min_recall": 0.1,
+ "min_precision": 0.1,
+ "mean_ap_weight": 5,
+ "class_names":['car','van','bicycle'],
+ "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'],
+ "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}
+ }
+ self.sample_interval = sample_interval
+
+
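+    # invert_pose: inverse of a rigid 4x4 transform, i.e. R -> R^T, t -> -R^T @ t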
+ def invert_pose(self, pose):
+ inv_pose = np.eye(4)
+ inv_pose[:3, :3] = np.transpose(pose[:3, :3])
+ inv_pose[:3, -1] = - inv_pose[:3, :3] @ pose[:3, -1]
+ return inv_pose
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ queue = []
+ index_list = list(range(index-self.queue_length*self.sample_interval, index,self.sample_interval))
+ random.shuffle(index_list)
+ index_list = sorted(index_list[1:])
+ index_list.append(index)
+ for i in index_list:
+ i = max(0, i)
+ input_dict = self.get_data_info(i)
+ if input_dict is None:
+ return None
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ queue.append(example)
+ return self.union2one(queue)
+
+
+ def union2one(self, queue):
+ imgs_list = [each['img'].data for each in queue]
+ metas_map = {}
+ prev_scene_token = None
+ prev_pos = None
+ prev_angle = None
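+        # turn absolute can_bus pose into per-frame deltas; the first frame of a
+        # scene is zeroed and flagged with prev_bev_exists=False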
+ for i, each in enumerate(queue):
+ metas_map[i] = each['img_metas'].data
+ if metas_map[i]['folder'] != prev_scene_token:
+ metas_map[i]['prev_bev_exists'] = False
+ prev_scene_token = metas_map[i]['folder']
+ prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] = 0
+ metas_map[i]['can_bus'][-1] = 0
+ else:
+ metas_map[i]['prev_bev_exists'] = True
+ tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] -= prev_pos
+ metas_map[i]['can_bus'][-1] -= prev_angle
+ prev_pos = copy.deepcopy(tmp_pos)
+ prev_angle = copy.deepcopy(tmp_angle)
+ queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True)
+ queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[-1]
+ return queue
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+ for i in range(len(info['gt_names'])):
+ if info['gt_names'][i] in self.NameMapping.keys():
+ info['gt_names'][i] = self.NameMapping[info['gt_names'][i]]
+
+ input_dict = dict(
+ folder=info['folder'],
+ scene_token=info['folder'],
+ frame_idx=info['frame_idx'],
+            ego_yaw=np.nan_to_num(info['ego_yaw'], nan=np.pi/2),
+ ego_translation=info['ego_translation'],
+ sensors=info['sensors'],
+ gt_ids=info['gt_ids'],
+ gt_boxes=info['gt_boxes'],
+ gt_names=info['gt_names'],
+ ego_vel = info['ego_vel'],
+ ego_accel = info['ego_accel'],
+ ego_rotation_rate = info['ego_rotation_rate'],
+ )
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego']
+ for sensor_type, cam_info in info['sensors'].items():
+ if not 'CAM' in sensor_type:
+ continue
+ image_paths.append(osp.join(self.data_root,cam_info['data_path']))
+ cam2ego = cam_info['cam2ego']
+ intrinsic = cam_info['intrinsic']
+ intrinsic_pad = np.eye(4)
+ intrinsic_pad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2cam = self.invert_pose(cam2ego) @ lidar2ego
+ lidar2img = intrinsic_pad @ lidar2cam
+ lidar2img_rts.append(lidar2img)
+ cam_intrinsics.append(intrinsic_pad)
+ lidar2cam_rts.append(lidar2cam)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam_intrinsic=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ ))
+
+ if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+ yaw = input_dict['ego_yaw']
+ rotation = list(Quaternion(axis=[0, 0, 1], radians=yaw))
+ if yaw < 0:
+ yaw += 2*np.pi
+ yaw_in_degree = yaw / np.pi * 180
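+        # can_bus layout (18 floats): [0:3] translation, [3:7] yaw quaternion,
+        # [7:10] velocity, [10:13] acceleration, [13:16] rotation rate,
+        # [16] yaw (rad), [17] yaw (deg)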
+ can_bus = np.zeros(18)
+ can_bus[:3] = input_dict['ego_translation']
+ can_bus[3:7] = rotation
+ can_bus[7:10] = input_dict['ego_vel']
+ can_bus[10:13] = input_dict['ego_accel']
+ can_bus[13:16] = input_dict['ego_rotation_rate']
+ can_bus[16] = yaw
+ can_bus[17] = yaw_in_degree
+ input_dict['can_bus'] = can_bus
+
+ return input_dict
+
+
+ def get_ann_info(self, index):
+ """Get annotation info according to the given index.
+
+ Args:
+ index (int): Index of the annotation data to get.
+
+ Returns:
+ dict: Annotation information consists of the following keys:
+
+ - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
+ 3D ground truth bboxes
+ - gt_labels_3d (np.ndarray): Labels of ground truths.
+ - gt_names (list[str]): Class names of ground truths.
+ """
+ info = self.data_infos[index]
+        # keep all boxes: num_points >= -1 retains boxes even with no lidar points
+ mask = (info['num_points'] >= -1)
+ gt_bboxes_3d = info['gt_boxes'][mask]
+ gt_names_3d = info['gt_names'][mask]
+ gt_labels_3d = []
+
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+ if not self.with_velocity:
+ gt_bboxes_3d = gt_bboxes_3d[:,0:7]
+
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ gt_names=gt_names_3d)
+ return anns_results
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+ result_path = result_files['pts_bbox']
+ with open(result_path) as f:
+ result_data = json.load(f)
+ pred_boxes = EvalBoxes.deserialize(result_data['results'], DetectionBox)
+ meta = result_data['meta']
+
+
+ gt_boxes = self.load_gt()
+
+ metric_data_list = DetectionMetricDataList()
+ for class_name in self.eval_cfg['class_names']:
+ for dist_th in self.eval_cfg['dist_ths']:
+ md = accumulate(gt_boxes, pred_boxes, class_name, center_distance, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+ metrics = DetectionMetrics(self.eval_cfg)
+
+ for class_name in self.eval_cfg['class_names']:
+ # Compute APs.
+ for dist_th in self.eval_cfg['dist_ths']:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.eval_cfg['min_recall'], self.eval_cfg['min_precision'])
+ metrics.add_label_ap(class_name, dist_th, ap)
+
+ # Compute TP metrics.
+ for metric_name in self.eval_cfg['tp_metrics']:
+ metric_data = metric_data_list[(class_name, self.eval_cfg['dist_th_tp'])]
+ tp = calc_tp(metric_data, self.eval_cfg['min_recall'], metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ metrics_summary = metrics.serialize()
+ metrics_summary['meta'] = meta.copy()
+ print('mAP: %.4f' % (metrics_summary['mean_ap']))
+ err_name_mapping = {
+ 'trans_err': 'mATE',
+ 'scale_err': 'mASE',
+ 'orient_err': 'mAOE',
+ 'vel_err': 'mAVE',
+ }
+ for tp_name, tp_val in metrics_summary['tp_errors'].items():
+ print('%s: %.4f' % (err_name_mapping[tp_name], tp_val))
+ print('NDS: %.4f' % (metrics_summary['nd_score']))
+ #print('Eval time: %.1fs' % metrics_summary['eval_time'])
+
+ # Print per-class metrics.
+ print()
+ print('Per-class results:')
+ print('Object Class\tAP\tATE\tASE\tAOE\tAVE')
+ class_aps = metrics_summary['mean_dist_aps']
+ class_tps = metrics_summary['label_tp_errors']
+ for class_name in class_aps.keys():
+ print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
+ % (class_name, class_aps[class_name],
+ class_tps[class_name]['trans_err'],
+ class_tps[class_name]['scale_err'],
+ class_tps[class_name]['orient_err'],
+ class_tps[class_name]['vel_err']))
+
+ detail = dict()
+ metric_prefix = 'bbox_NuScenes'
+ for name in self.eval_cfg['class_names']:
+ for k, v in metrics_summary['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics_summary['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics_summary['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,self.eval_cfg['err_name_maping'][k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics_summary['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics_summary['mean_ap']
+
+
+ return detail
+
+
+ def load_gt(self):
+ all_annotations = EvalBoxes()
+ for i in range(len(self.data_infos)):
+ sample_boxes = []
+ sample_data = self.data_infos[i]
+
+ gt_boxes = sample_data['gt_boxes']
+
+ for j in range(gt_boxes.shape[0]):
+ class_name = self.NameMapping[sample_data['gt_names'][j]]
+ if not class_name in self.eval_cfg['class_range'].keys():
+ continue
+ range_x, range_y = self.eval_cfg['class_range'][class_name]
+ if abs(gt_boxes[j,0]) > range_x or abs(gt_boxes[j,1]) > range_y:
+ continue
+ sample_boxes.append(DetectionBox(
+ sample_token=sample_data['folder']+'_'+str(sample_data['frame_idx']),
+ translation=gt_boxes[j,0:3],
+ size=gt_boxes[j,3:6],
+ rotation=list(Quaternion(axis=[0, 0, 1], radians=-gt_boxes[j,6]-np.pi/2)),
+ velocity=gt_boxes[j,7:9],
+ num_pts=int(sample_data['num_points'][j]),
+ detection_name=self.NameMapping[sample_data['gt_names'][j]],
+ detection_score=-1.0,
+ attribute_name=self.NameMapping[sample_data['gt_names'][j]]
+ ))
+ all_annotations.add_boxes(sample_data['folder']+'_'+str(sample_data['frame_idx']), sample_boxes)
+ return all_annotations
+
+ def _format_bbox(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+
+ Returns:
+ str: Path of the output json file.
+ """
+
+
+ nusc_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(track_iter_progress(results)):
+ #pdb.set_trace()
+ annos = []
+ box3d = det['boxes_3d']
+ scores = det['scores_3d']
+ labels = det['labels_3d']
+ box_gravity_center = box3d.gravity_center
+ box_dims = box3d.dims
+ box_yaw = box3d.yaw.numpy()
+ box_yaw = -box_yaw - np.pi / 2
+ sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx'])
+
+
+
+ for i in range(len(box3d)):
+ #import pdb;pdb.set_trace()
+ quat = list(Quaternion(axis=[0, 0, 1], radians=box_yaw[i]))
+ velocity = [box3d.tensor[i, 7].item(),box3d.tensor[i, 8].item()]
+ name = mapped_class_names[labels[i]]
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box_gravity_center[i].tolist(),
+ size=box_dims[i].tolist(),
+ rotation=quat,
+ velocity=velocity,
+ detection_name=name,
+ detection_score=scores[i].item(),
+ attribute_name=name)
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ }
+
+ mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print('Results written to', res_path)
+ dump(nusc_submissions, res_path)
+ return res_path
+
+ def format_results(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+ dict containing the json filepaths, `tmp_dir` is the temporal \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ # assert len(results) == len(self), (
+ # 'The length of results is not equal to the dataset len: {} != {}'.
+ # format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+                print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+ return result_files, tmp_dir
+
diff --git a/mmcv/datasets/B2D_e2e_dataset.py b/mmcv/datasets/B2D_e2e_dataset.py
new file mode 100644
index 0000000..9f5b4e0
--- /dev/null
+++ b/mmcv/datasets/B2D_e2e_dataset.py
@@ -0,0 +1,855 @@
+import copy
+import numpy as np
+import os
+from os import path as osp
+import torch
+import random
+import json, pickle
+import tempfile
+import cv2
+from pyquaternion import Quaternion
+from mmcv.datasets import DATASETS
+from mmcv.utils import save_tensor
+from mmcv.parallel import DataContainer as DC
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from mmcv.fileio.io import load, dump
+from mmcv.utils import track_iter_progress, mkdir_or_exist
+from mmcv.datasets.pipelines import to_tensor
+from .custom_3d import Custom3DDataset
+from .pipelines import Compose
+from .nuscenes_styled_eval_utils import DetectionMetrics, EvalBoxes, DetectionBox,center_distance,accumulate,DetectionMetricDataList,calc_ap, calc_tp, quaternion_yaw
+from prettytable import PrettyTable
+
+
+
+@DATASETS.register_module()
+class B2D_E2E_Dataset(Custom3DDataset):
+    def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, with_velocity=True,
+                 sample_interval=5, name_mapping=None, eval_cfg=None, map_root=None, map_file=None,
+                 past_frames=4, future_frames=4, predict_frames=12, planning_frames=6,
+                 patch_size=[102.4, 102.4], point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+                 occ_receptive_field=3, occ_n_future=6, occ_filter_invalid_sample=False,
+                 occ_filter_by_valid_flag=False, eval_mod=None, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.queue_length = queue_length
+        self.bev_size = bev_size
+ self.overlap_test = overlap_test
+ self.with_velocity = with_velocity
+ self.NameMapping = name_mapping
+ self.eval_cfg = eval_cfg
+ self.sample_interval = sample_interval
+ self.past_frames = past_frames
+ self.future_frames = future_frames
+ self.predict_frames = predict_frames
+ self.planning_frames = planning_frames
+ self.map_root = map_root
+ self.map_file = map_file
+ self.point_cloud_range = np.array(point_cloud_range)
+ self.patch_size = patch_size
+ self.occ_receptive_field = occ_receptive_field # past + current
+ self.occ_n_future = occ_n_future # future only
+ self.occ_filter_invalid_sample = occ_filter_invalid_sample
+ self.occ_filter_by_valid_flag = occ_filter_by_valid_flag
+ self.occ_only_total_frames = 7 # NOTE: hardcode, not influenced by planning
+ self.eval_mod = eval_mod
+ self.map_element_class = {'Broken':0, 'Solid':1, 'SolidSolid':2,'Center':3,'TrafficLight':4,'StopSign':5}
+ with open(self.map_file,'rb') as f:
+ self.map_infos = pickle.load(f)
+
+ def invert_pose(self, pose):
+ inv_pose = np.eye(4)
+ inv_pose[:3, :3] = np.transpose(pose[:3, :3])
+ inv_pose[:3, -1] = - inv_pose[:3, :3] @ pose[:3, -1]
+ return inv_pose
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ queue = []
+ index_list = list(range(index-self.queue_length*self.sample_interval, index,self.sample_interval))
+ random.shuffle(index_list)
+ index_list = sorted(index_list[1:])
+ index_list.append(index)
+ for i in index_list:
+ i = max(0, i)
+ input_dict = self.get_data_info(i)
+ if input_dict is None:
+ return None
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ queue.append(example)
+ return self.union2one(queue)
+
+ def union2one(self, queue):
+ imgs_list = [each['img'].data for each in queue]
+ gt_labels_3d_list = [each['gt_labels_3d'].data for each in queue]
+ gt_sdc_label_list = [each['gt_sdc_label'].data for each in queue]
+ gt_inds_list = [to_tensor(each['gt_inds']) for each in queue]
+ gt_bboxes_3d_list = [each['gt_bboxes_3d'].data for each in queue]
+ gt_past_traj_list = [to_tensor(each['gt_past_traj']) for each in queue]
+ gt_past_traj_mask_list = [ to_tensor(each['gt_past_traj_mask']) for each in queue]
+ gt_sdc_bbox_list = [each['gt_sdc_bbox'].data for each in queue]
+ l2g_r_mat_list = [to_tensor(each['l2g_r_mat']) for each in queue]
+ l2g_t_list = [to_tensor(each['l2g_t']) for each in queue]
+ timestamp_list = [to_tensor(each['timestamp']) for each in queue]
+ gt_fut_traj = to_tensor(queue[-1]['gt_fut_traj'])
+ gt_fut_traj_mask = to_tensor(queue[-1]['gt_fut_traj_mask'])
+ if 'gt_future_boxes' in queue[-1]:
+ gt_future_boxes_list = queue[-1]['gt_future_boxes']
+ else:
+ gt_future_boxes_list = None
+ if 'gt_future_labels' in queue[-1]:
+ gt_future_labels_list = [to_tensor(each) for each in queue[-1]['gt_future_labels']]
+ else:
+ gt_future_labels_list = None
+
+ metas_map = {}
+ prev_scene_token = None
+ prev_pos = None
+ prev_angle = None
+ for i, each in enumerate(queue):
+ metas_map[i] = each['img_metas'].data
+ if metas_map[i]['folder'] != prev_scene_token:
+ metas_map[i]['prev_bev_exists'] = False
+ prev_scene_token = metas_map[i]['folder']
+ prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] = 0
+ metas_map[i]['can_bus'][-1] = 0
+ else:
+ metas_map[i]['prev_bev_exists'] = True
+ tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] -= prev_pos
+ metas_map[i]['can_bus'][-1] -= prev_angle
+ prev_pos = copy.deepcopy(tmp_pos)
+ prev_angle = copy.deepcopy(tmp_angle)
+ queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True)
+ queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[-1]
+ queue['gt_labels_3d'] = DC(gt_labels_3d_list)
+ queue['gt_sdc_label'] = DC(gt_sdc_label_list)
+ queue['gt_inds'] = DC(gt_inds_list)
+ queue['gt_bboxes_3d'] = DC(gt_bboxes_3d_list, cpu_only=True)
+ queue['gt_sdc_bbox'] = DC(gt_sdc_bbox_list, cpu_only=True)
+ queue['l2g_r_mat'] = DC(l2g_r_mat_list)
+ queue['l2g_t'] = DC(l2g_t_list)
+ queue['timestamp'] = DC(timestamp_list)
+ queue['gt_fut_traj'] = DC(gt_fut_traj)
+ queue['gt_fut_traj_mask'] = DC(gt_fut_traj_mask)
+ queue['gt_past_traj'] = DC(gt_past_traj_list)
+ queue['gt_past_traj_mask'] = DC(gt_past_traj_mask_list)
+ if gt_future_boxes_list is not None:
+ queue['gt_future_boxes'] = DC(gt_future_boxes_list, cpu_only=True)
+ if gt_future_labels_list is not None:
+ queue['gt_future_labels'] = DC(gt_future_labels_list)
+
+ return queue
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+
+ for i in range(len(info['gt_names'])):
+ if info['gt_names'][i] in self.NameMapping.keys():
+ info['gt_names'][i] = self.NameMapping[info['gt_names'][i]]
+
+
+ gt_masks,gt_labels,gt_bboxes = self.get_map_info(index)
+
+
+ input_dict = dict(
+ folder=info['folder'],
+ scene_token=info['folder'],
+ frame_idx=info['frame_idx'],
+ ego_yaw=np.nan_to_num(info['ego_yaw'],nan=np.pi/2),
+ ego_translation=info['ego_translation'],
+ sensors=info['sensors'],
+ world2lidar=info['sensors']['LIDAR_TOP']['world2lidar'],
+ gt_ids=info['gt_ids'],
+ gt_boxes=info['gt_boxes'],
+ gt_names=info['gt_names'],
+ ego_vel = info['ego_vel'],
+ ego_accel = info['ego_accel'],
+ ego_rotation_rate = info['ego_rotation_rate'],
+ npc2world = info['npc2world'],
+ gt_lane_labels=gt_labels,
+ gt_lane_bboxes=gt_bboxes,
+ gt_lane_masks=gt_masks,
+ timestamp=info['frame_idx']/10
+
+ )
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego']
+ for sensor_type, cam_info in info['sensors'].items():
+ if not 'CAM' in sensor_type:
+ continue
+ image_paths.append(osp.join(self.data_root,cam_info['data_path']))
+ # obtain lidar to image transformation matrix
+ cam2ego = cam_info['cam2ego']
+ intrinsic = cam_info['intrinsic']
+ intrinsic_pad = np.eye(4)
+ intrinsic_pad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2cam = self.invert_pose(cam2ego) @ lidar2ego
+ lidar2img = intrinsic_pad @ lidar2cam
+ lidar2img_rts.append(lidar2img)
+ cam_intrinsics.append(intrinsic_pad)
+ lidar2cam_rts.append(lidar2cam)
+ ego2world = np.eye(4)
+ ego2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=input_dict['ego_yaw']).rotation_matrix
+ ego2world[0:3,3] = input_dict['ego_translation']
+ lidar2global = ego2world @ lidar2ego
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam_intrinsic=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ l2g_r_mat=lidar2global[0:3,0:3],
+ l2g_t=lidar2global[0:3,3]
+
+ ))
+
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+ yaw = input_dict['ego_yaw']
+ rotation = list(Quaternion(axis=[0, 0, 1], radians=yaw))
+ if yaw < 0:
+ yaw += 2*np.pi
+ yaw_in_degree = yaw / np.pi * 180
+
+ can_bus = np.zeros(18)
+ can_bus[:3] = input_dict['ego_translation']
+ can_bus[3:7] = rotation
+ can_bus[7:10] = input_dict['ego_vel']
+ can_bus[10:13] = input_dict['ego_accel']
+ can_bus[13:16] = input_dict['ego_rotation_rate']
+ can_bus[16] = yaw
+ can_bus[17] = yaw_in_degree
+ input_dict['can_bus'] = can_bus
+ all_frames = []
+ for adj_idx in range(index-self.occ_receptive_field+1,index+self.occ_n_future+1):
+ if adj_idx<0 or adj_idx>=len(self.data_infos):
+ all_frames.append(-1)
+ elif self.data_infos[adj_idx]['folder'] != self.data_infos[index]['folder']:
+ all_frames.append(-1)
+ else:
+ all_frames.append(adj_idx)
+
+ future_frames = all_frames[self.occ_receptive_field-1:]
+ input_dict['occ_has_invalid_frame'] = (-1 in all_frames[:self.occ_only_total_frames])
+ input_dict['occ_img_is_valid'] = np.array(all_frames) >= 0
+ occ_future_ann_infos = []
+ for future_frame in future_frames:
+ if future_frame >= 0:
+ occ_future_ann_infos.append(
+ self.get_ann_boxes_only(future_frame),
+ )
+ else:
+ occ_future_ann_infos.append(None)
+ input_dict['occ_future_ann_infos'] = occ_future_ann_infos
+
+ input_dict.update(self.occ_get_transforms(future_frames))
+ sdc_planning, sdc_planning_mask = self.get_ego_future_xy(index,self.sample_interval,self.planning_frames)
+ input_dict['sdc_planning'] = sdc_planning
+ input_dict['sdc_planning_mask'] = sdc_planning_mask
+ command = info['command_near']
+ if command < 0:
+ command = 4
+ command -= 1
+ input_dict['command'] = command
+
+ return input_dict
+
+
+ def get_map_info(self, index):
+
+ gt_masks = []
+ gt_labels = []
+ gt_bboxes = []
+
+ ann_info = self.data_infos[index]
+ town_name = ann_info['town_name']
+ map_info = self.map_infos[town_name]
+ lane_points = map_info['lane_points']
+ lane_sample_points = map_info['lane_sample_points']
+ lane_types = map_info['lane_types']
+ trigger_volumes_points = map_info['trigger_volumes_points']
+ trigger_volumes_sample_points = map_info['trigger_volumes_sample_points']
+ trigger_volumes_types = map_info['trigger_volumes_types']
+ world2lidar = np.array(ann_info['sensors']['LIDAR_TOP']['world2lidar'])
+ ego_xy = np.linalg.inv(world2lidar)[0:2,3]
+
+ #1st search
+ max_distance = 100
+ chosed_idx = []
+ for idx in range(len(lane_sample_points)):
+ single_sample_points = lane_sample_points[idx]
+ distance = np.linalg.norm((single_sample_points[:,0:2]-ego_xy),axis=-1)
+ if np.min(distance) < max_distance:
+ chosed_idx.append(idx)
+
+ for idx in chosed_idx:
+ if not lane_types[idx] in self.map_element_class.keys():
+ continue
+ points = lane_points[idx]
+ points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1)
+ points_in_ego = (world2lidar @ points.T).T
+ #print(points_in_ego)
+            # keep only the lane points that fall inside the BEV perception range
+            mask = (points_in_ego[:,0]>self.point_cloud_range[0]) & (points_in_ego[:,0]<self.point_cloud_range[3]) & (points_in_ego[:,1]>self.point_cloud_range[1]) & (points_in_ego[:,1]<self.point_cloud_range[4])
+            points_in_ego_range = points_in_ego[mask,0:2]
+            if len(points_in_ego_range) > 1:
+ gt_mask = np.zeros(self.bev_size,dtype=np.uint8)
+ normalized_points = np.zeros_like(points_in_ego_range)
+ normalized_points[:,0] = (points_in_ego_range[:,0] + self.patch_size[0]/2)*(self.bev_size[0]/self.patch_size[0])
+ normalized_points[:,1] = (points_in_ego_range[:,1] + self.patch_size[1]/2)*(self.bev_size[1]/self.patch_size[1])
+ cv2.polylines(gt_mask, [normalized_points.astype(np.int32)], False, color=1, thickness=2)
+ gt_label = self.map_element_class[lane_types[idx]]
+ gt_masks.append(gt_mask)
+ gt_labels.append(gt_label)
+ ys, xs = np.where(gt_mask==1)
+ gt_bboxes.append([min(xs), min(ys), max(xs), max(ys)])
+
+ for idx in range(len(trigger_volumes_points)):
+ if not trigger_volumes_types[idx] in self.map_element_class.keys():
+ continue
+ points = trigger_volumes_points[idx]
+ points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1)
+ points_in_ego = (world2lidar @ points.T).T
+            mask = (points_in_ego[:,0]>self.point_cloud_range[0]) & (points_in_ego[:,0]<self.point_cloud_range[3]) & (points_in_ego[:,1]>self.point_cloud_range[1]) & (points_in_ego[:,1]<self.point_cloud_range[4])
+            # skip trigger volumes that are not fully inside the BEV range
+            if not mask.all():
+                continue
+            points_in_ego_range = points_in_ego[mask,0:2]
+            gt_mask = np.zeros(self.bev_size,dtype=np.uint8)
+            normalized_points = np.zeros_like(points_in_ego_range)
+            normalized_points[:,0] = (points_in_ego_range[:,0] + self.patch_size[0]/2)*(self.bev_size[0]/self.patch_size[0])
+            normalized_points[:,1] = (points_in_ego_range[:,1] + self.patch_size[1]/2)*(self.bev_size[1]/self.patch_size[1])
+            cv2.fillPoly(gt_mask, [normalized_points.astype(np.int32)], color=1)
+            gt_label = self.map_element_class[trigger_volumes_types[idx]]
+            gt_masks.append(gt_mask)
+            gt_labels.append(gt_label)
+            ys, xs = np.where(gt_mask==1)
+            gt_bboxes.append([min(xs), min(ys), max(xs), max(ys)])
+
+        gt_masks = np.stack(gt_masks)
+        gt_labels = np.array(gt_labels)
+        gt_bboxes = np.array(gt_bboxes)
+        return gt_masks, gt_labels, gt_bboxes
+
+    def get_ann_info(self, index):
+        """Get annotation info according to the given index.
+
+        Args:
+            index (int): Index of the annotation data to get.
+
+        Returns:
+            dict: Annotation information, including 3D ground truth bboxes,
+                labels, track ids and past/future trajectories.
+        """
+        info = self.data_infos[index]
+        for i in range(len(info['gt_names'])):
+            if info['gt_names'][i] in self.NameMapping.keys():
+                info['gt_names'][i] = self.NameMapping[info['gt_names'][i]]
+        # keep all boxes: num_points >= -1 retains boxes even with no lidar points
+        mask = (info['num_points'] >= -1)
+ gt_bboxes_3d = info['gt_boxes'][mask]
+ gt_names_3d = info['gt_names'][mask]
+ gt_inds = info['gt_ids']
+ gt_labels_3d = []
+
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+ if not self.with_velocity:
+ gt_bboxes_3d = gt_bboxes_3d[:,0:7]
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ ego_future_track, ego_future_mask = self.get_ego_future_xy(index,self.sample_interval,self.predict_frames)
+ past_track, past_mask = self.get_past_or_future_xy(index,self.sample_interval,self.past_frames,past_or_future='past',local_xy=True)
+ predict_track, predict_mask = self.get_past_or_future_xy(index,self.sample_interval,self.predict_frames,past_or_future='future',local_xy=False)
+        mask = (past_mask.sum((1,2))>0).astype(int)
+ future_track = predict_track[:,0:self.future_frames,:]*mask[:,None,None]
+ future_mask = predict_mask[:,0:self.future_frames,:]*mask[:,None,None]
+ full_past_track = np.concatenate([past_track,future_track],axis=1)
+ full_past_mask = np.concatenate([past_mask,future_mask],axis=1)
+ gt_sdc_bbox, gt_sdc_label =self.generate_sdc_info(index)
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ gt_names=gt_names_3d,
+ gt_inds=gt_inds,
+ gt_fut_traj=predict_track,
+ gt_fut_traj_mask=predict_mask,
+ gt_past_traj=full_past_track,
+ gt_past_traj_mask=full_past_mask,
+ gt_sdc_bbox=gt_sdc_bbox,
+ gt_sdc_label=gt_sdc_label,
+ gt_sdc_fut_traj=ego_future_track[:,:,0:2],
+ gt_sdc_fut_traj_mask=ego_future_mask,
+ )
+ return anns_results
+
+ def get_ann_boxes_only(self, index):
+
+ info = self.data_infos[index]
+ for i in range(len(info['gt_names'])):
+ if info['gt_names'][i] in self.NameMapping.keys():
+ info['gt_names'][i] = self.NameMapping[info['gt_names'][i]]
+ gt_bboxes_3d = info['gt_boxes']
+ gt_names_3d = info['gt_names']
+ gt_inds = info['gt_ids']
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+ if not self.with_velocity:
+ gt_bboxes_3d = gt_bboxes_3d[:,0:7]
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+ boxes_annos = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ gt_inds=gt_inds,
+ )
+ return boxes_annos
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def generate_sdc_info(self,idx):
+
+ info = self.data_infos[idx]
+ ego_size = info['ego_size']
+ ego_vel = info['ego_vel']
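+        # Descriptive note: the 9-dim pseudo box below encodes [x, y, z] = 0 in the lidar
+        # frame, the three ego_size extents, a fixed yaw of -pi, and the planar ego
+        # velocity components in swapped order (ego_vel[1], ego_vel[0]).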
+ psudo_sdc_bbox = np.array([0.0, 0.0, 0.0, ego_size[0], ego_size[1], ego_size[2], -np.pi, ego_vel[1], ego_vel[0] ])
+ if not self.with_velocity:
+ psudo_sdc_bbox = psudo_sdc_bbox[0:7]
+ gt_bboxes_3d = np.array([psudo_sdc_bbox]).astype(np.float32)
+ gt_names_3d = ['car']
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ gt_labels_3d = DC(to_tensor(gt_labels_3d))
+ gt_bboxes_3d = DC(gt_bboxes_3d, cpu_only=True)
+
+ return gt_bboxes_3d, gt_labels_3d
+
+ def get_past_or_future_xy(self,idx,sample_rate,frames,past_or_future,local_xy=False):
+
+ assert past_or_future in ['past','future']
+ if past_or_future == 'past':
+ adj_idx_list = range(idx-sample_rate,idx-(frames+1)*sample_rate,-sample_rate)
+ else:
+ adj_idx_list = range(idx+sample_rate,idx+(frames+1)*sample_rate,sample_rate)
+
+ cur_frame = self.data_infos[idx]
+ box_ids = cur_frame['gt_ids']
+ adj_track = np.zeros((len(box_ids),frames,2))
+ adj_mask = np.zeros((len(box_ids),frames,2))
+ world2lidar_ego_cur = cur_frame['sensors']['LIDAR_TOP']['world2lidar']
+ for i in range(len(box_ids)):
+ box_id = box_ids[i]
+ cur_box2lidar = world2lidar_ego_cur @ cur_frame['npc2world'][i]
+ cur_xy = cur_box2lidar[0:2,3]
+ for j in range(len(adj_idx_list)):
+ adj_idx = adj_idx_list[j]
+ if adj_idx <0 or adj_idx>=len(self.data_infos):
+ break
+ adj_frame = self.data_infos[adj_idx]
+ if adj_frame['folder'] != cur_frame ['folder']:
+ break
+ if len(np.where(adj_frame['gt_ids']==box_id)[0])==0:
+ continue
+ assert len(np.where(adj_frame['gt_ids']==box_id)[0]) == 1 , np.where(adj_frame['gt_ids']==box_id)[0]
+ adj_idx = np.where(adj_frame['gt_ids']==box_id)[0][0]
+ adj_box2lidar = world2lidar_ego_cur @ adj_frame['npc2world'][adj_idx]
+ adj_xy = adj_box2lidar[0:2,3]
+ if local_xy:
+ adj_xy -= cur_xy
+ adj_track[i,j,:] = adj_xy
+ adj_mask[i,j,:] = 1
+ return adj_track, adj_mask
+
+ def get_ego_future_xy(self,idx,sample_rate,frames):
+
+ adj_idx_list = range(idx+sample_rate,idx+(frames+1)*sample_rate,sample_rate)
+ cur_frame = self.data_infos[idx]
+ adj_track = np.zeros((1,frames,3))
+ adj_mask = np.zeros((1,frames,2))
+ world2lidar_ego_cur = cur_frame['sensors']['LIDAR_TOP']['world2lidar']
+ for j in range(len(adj_idx_list)):
+ adj_idx = adj_idx_list[j]
+ if adj_idx <0 or adj_idx>=len(self.data_infos):
+ break
+ adj_frame = self.data_infos[adj_idx]
+ if adj_frame['folder'] != cur_frame ['folder']:
+ break
+ world2lidar_ego_adj = adj_frame['sensors']['LIDAR_TOP']['world2lidar']
+ adj2cur_lidar = world2lidar_ego_cur @ np.linalg.inv(world2lidar_ego_adj)
+ xy = adj2cur_lidar[0:2,3]
+ yaw = np.arctan2(adj2cur_lidar[1,0],adj2cur_lidar[0,0])
+ yaw = -yaw -np.pi
+ while yaw > np.pi:
+ yaw -= np.pi*2
+ while yaw < -np.pi:
+ yaw += np.pi*2
+ adj_track[0,j,0:2] = xy
+ adj_track[0,j,2] = yaw
+ adj_mask[0,j,:] = 1
+
+ return adj_track, adj_mask
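+
+    # Minimal standalone sketch of the yaw normalization used above; the helper name
+    # wrap_to_pi is illustrative, not part of this codebase:
+    #
+    #   def wrap_to_pi(angle):
+    #       """Wrap an angle in radians into [-pi, pi]."""
+    #       while angle > np.pi:
+    #           angle -= 2 * np.pi
+    #       while angle < -np.pi:
+    #           angle += 2 * np.pi
+    #       return angle
+    #
+    #   # get_ego_future_xy stores wrap_to_pi(-yaw - np.pi) as the heading of each future
+    #   # ego pose in adj_track[0, j, 2].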
+
+ def occ_get_transforms(self, indices, data_type=torch.float32):
+
+ l2e_r_mats = []
+ l2e_t_vecs = []
+ e2g_r_mats = []
+ e2g_t_vecs = []
+
+ for index in indices:
+ if index == -1:
+ l2e_r_mats.append(None)
+ l2e_t_vecs.append(None)
+ e2g_r_mats.append(None)
+ e2g_t_vecs.append(None)
+ else:
+ info = self.data_infos[index]
+ lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego']
+ l2e_r = lidar2ego[0:3,0:3]
+ l2e_t = lidar2ego[0:3,3]
+ ego2global = np.linalg.inv(info['world2ego'])
+ e2g_r = ego2global[0:3,0:3]
+ e2g_t = ego2global[0:3,3]
+ l2e_r_mats.append(torch.tensor(l2e_r).to(data_type))
+ l2e_t_vecs.append(torch.tensor(l2e_t).to(data_type))
+ e2g_r_mats.append(torch.tensor(e2g_r).to(data_type))
+ e2g_t_vecs.append(torch.tensor(e2g_t).to(data_type))
+ res = {
+ 'occ_l2e_r_mats': l2e_r_mats,
+ 'occ_l2e_t_vecs': l2e_t_vecs,
+ 'occ_e2g_r_mats': e2g_r_mats,
+ 'occ_e2g_t_vecs': e2g_t_vecs,
+ }
+
+ return res
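+
+    # Sketch (illustrative; variable names are not from this codebase) of recombining the
+    # returned rotation/translation pairs into full 4x4 transforms, using the numpy values
+    # before the tensor conversion above:
+    #
+    #   T_l2e = np.eye(4); T_l2e[:3, :3] = l2e_r; T_l2e[:3, 3] = l2e_t
+    #   T_e2g = np.eye(4); T_e2g[:3, :3] = e2g_r; T_e2g[:3, 3] = e2g_t
+    #   lidar2global = T_e2g @ T_l2e   # chain lidar -> ego -> global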
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+
+        # NOTE: Currently we only support evaluation of detection and planning.
+
+ result_files, tmp_dir = self.format_results(results['bbox_results'], jsonfile_prefix)
+ result_path = result_files
+ with open(result_path) as f:
+ result_data = json.load(f)
+ pred_boxes = EvalBoxes.deserialize(result_data['results'], DetectionBox)
+ meta = result_data['meta']
+
+ gt_boxes = self.load_gt()
+
+ metric_data_list = DetectionMetricDataList()
+ for class_name in self.eval_cfg['class_names']:
+ for dist_th in self.eval_cfg['dist_ths']:
+ md = accumulate(gt_boxes, pred_boxes, class_name, center_distance, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+ metrics = DetectionMetrics(self.eval_cfg)
+
+ for class_name in self.eval_cfg['class_names']:
+ # Compute APs.
+ for dist_th in self.eval_cfg['dist_ths']:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.eval_cfg['min_recall'], self.eval_cfg['min_precision'])
+ metrics.add_label_ap(class_name, dist_th, ap)
+
+ # Compute TP metrics.
+ for metric_name in self.eval_cfg['tp_metrics']:
+ metric_data = metric_data_list[(class_name, self.eval_cfg['dist_th_tp'])]
+ tp = calc_tp(metric_data, self.eval_cfg['min_recall'], metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ metrics_summary = metrics.serialize()
+ metrics_summary['meta'] = meta.copy()
+ print('mAP: %.4f' % (metrics_summary['mean_ap']))
+ err_name_mapping = {
+ 'trans_err': 'mATE',
+ 'scale_err': 'mASE',
+ 'orient_err': 'mAOE',
+ 'vel_err': 'mAVE',
+ }
+ for tp_name, tp_val in metrics_summary['tp_errors'].items():
+ print('%s: %.4f' % (err_name_mapping[tp_name], tp_val))
+ print('NDS: %.4f' % (metrics_summary['nd_score']))
+ #print('Eval time: %.1fs' % metrics_summary['eval_time'])
+
+ # Print per-class metrics.
+ print()
+ print('Per-class results:')
+ print('Object Class\tAP\tATE\tASE\tAOE\tAVE')
+ class_aps = metrics_summary['mean_dist_aps']
+ class_tps = metrics_summary['label_tp_errors']
+ for class_name in class_aps.keys():
+ print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
+ % (class_name, class_aps[class_name],
+ class_tps[class_name]['trans_err'],
+ class_tps[class_name]['scale_err'],
+ class_tps[class_name]['orient_err'],
+ class_tps[class_name]['vel_err']))
+
+ detail = dict()
+ metric_prefix = 'bbox_NuScenes'
+ for name in self.eval_cfg['class_names']:
+ for k, v in metrics_summary['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics_summary['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics_summary['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,self.eval_cfg['err_name_maping'][k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics_summary['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics_summary['mean_ap']
+
+ if 'planning_results_computed' in results.keys():
+ planning_results_computed = results['planning_results_computed']
+ planning_tab = PrettyTable()
+ planning_tab.field_names = [
+ "metrics", "0.5s", "1.0s", "1.5s", "2.0s", "2.5s", "3.0s"]
+ for key in planning_results_computed.keys():
+ value = planning_results_computed[key]
+ row_value = []
+ row_value.append(key)
+ for i in range(len(value)):
+ row_value.append('%.4f' % float(value[i]))
+ planning_tab.add_row(row_value)
+ print(planning_tab)
+
+
+ return detail
+
+ def load_gt(self):
+ all_annotations = EvalBoxes()
+ for i in range(len(self.data_infos)):
+ sample_boxes = []
+ sample_data = self.data_infos[i]
+
+ gt_boxes = sample_data['gt_boxes']
+
+ for j in range(gt_boxes.shape[0]):
+ class_name = self.NameMapping[sample_data['gt_names'][j]]
+ if not class_name in self.eval_cfg['class_range'].keys():
+ continue
+ range_x, range_y = self.eval_cfg['class_range'][class_name]
+ if abs(gt_boxes[j,0]) > range_x or abs(gt_boxes[j,1]) > range_y:
+ continue
+ sample_boxes.append(DetectionBox(
+ sample_token=sample_data['folder']+'_'+str(sample_data['frame_idx']),
+ translation=gt_boxes[j,0:3],
+ size=gt_boxes[j,3:6],
+ rotation=list(Quaternion(axis=[0, 0, 1], radians=-gt_boxes[j,6]-np.pi/2)),
+ velocity=gt_boxes[j,7:9],
+ num_pts=int(sample_data['num_points'][j]),
+ detection_name=self.NameMapping[sample_data['gt_names'][j]],
+ detection_score=-1.0,
+ attribute_name=self.NameMapping[sample_data['gt_names'][j]]
+ ))
+ all_annotations.add_boxes(sample_data['folder']+'_'+str(sample_data['frame_idx']), sample_boxes)
+ return all_annotations
+
+ def _format_bbox(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+
+ Returns:
+ str: Path of the output json file.
+ """
+
+
+ nusc_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(track_iter_progress(results)):
+ #pdb.set_trace()
+ annos = []
+ box3d = det['boxes_3d']
+ scores = det['scores_3d']
+ labels = det['labels_3d']
+ box_gravity_center = box3d.gravity_center
+ box_dims = box3d.dims
+ box_yaw = box3d.yaw.numpy()
+ box_yaw = -box_yaw - np.pi / 2
+ sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx'])
+
+
+
+ for i in range(len(box3d)):
+ #import pdb;pdb.set_trace()
+ quat = list(Quaternion(axis=[0, 0, 1], radians=box_yaw[i]))
+ velocity = [box3d.tensor[i, 7].item(),box3d.tensor[i, 8].item()]
+ name = mapped_class_names[labels[i]]
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box_gravity_center[i].tolist(),
+ size=box_dims[i].tolist(),
+ rotation=quat,
+ velocity=velocity,
+ detection_name=name,
+ detection_score=scores[i].item(),
+ attribute_name=name)
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ }
+
+ mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print('Results written to', res_path)
+ dump(nusc_submissions, res_path)
+ return res_path
+
+ def format_results(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+ dict containing the json filepaths, `tmp_dir` is the temporal \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ # assert len(results) == len(self), (
+ # 'The length of results is not equal to the dataset len: {} != {}'.
+ # format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+                print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+ return result_files, tmp_dir
+
diff --git a/mmcv/datasets/B2D_vad_dataset.py b/mmcv/datasets/B2D_vad_dataset.py
new file mode 100644
index 0000000..f32cf25
--- /dev/null
+++ b/mmcv/datasets/B2D_vad_dataset.py
@@ -0,0 +1,1037 @@
+import copy
+import numpy as np
+import os
+from os import path as osp
+import torch
+import random
+import json, pickle
+import tempfile
+import cv2
+import pyquaternion
+from pyquaternion import Quaternion
+import mmcv
+from mmcv.datasets import DATASETS
+from mmcv.utils import save_tensor
+from mmcv.parallel import DataContainer as DC
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from mmcv.fileio.io import load, dump
+from mmcv.utils import track_iter_progress, mkdir_or_exist
+from mmcv.datasets.pipelines import to_tensor
+from .custom_3d import Custom3DDataset
+from .pipelines import Compose
+from mmcv.datasets.map_utils.struct import LiDARInstanceLines
+from shapely.geometry import LineString
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .vad_custom_nuscenes_eval import NuScenesEval_custom
+from nuscenes.eval.common.utils import center_distance
+import random
+from nuscenes.utils.data_classes import Box as NuScenesBox
+from mmcv.core.bbox.structures.nuscenes_box import CustomNuscenesBox
+from shapely import affinity, ops
+from shapely.geometry import LineString, box, MultiPolygon, MultiLineString
+from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer
+from nuscenes.eval.detection.constants import DETECTION_NAMES
+from mmcv.datasets.map_utils.mean_ap import eval_map
+from mmcv.datasets.map_utils.mean_ap import format_res_gt_by_classes
+from .nuscenes_styled_eval_utils import DetectionMetrics, EvalBoxes, DetectionBox,center_distance,accumulate,DetectionMetricDataList,calc_ap, calc_tp, quaternion_yaw
+
+@DATASETS.register_module()
+class B2D_VAD_Dataset(Custom3DDataset):
+
+
+ def __init__(self, queue_length=4, bev_size=(200, 200),overlap_test=False,with_velocity=True,sample_interval=5,name_mapping= None,eval_cfg = None, map_root =None,map_file=None,past_frames=2, future_frames=6,point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] ,polyline_points_num=20,*args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.queue_length = queue_length
+ self.bev_size = bev_size
+ self.overlap_test = overlap_test
+ self.with_velocity = with_velocity
+ self.NameMapping = name_mapping
+ self.eval_cfg = eval_cfg
+ self.sample_interval = sample_interval
+ self.past_frames = past_frames
+ self.future_frames = future_frames
+ self.map_root = map_root
+ self.map_file = map_file
+ self.point_cloud_range = np.array(point_cloud_range)
+ self.polyline_points_num = polyline_points_num
+ self.map_element_class = {'Broken':0, 'Solid':1, 'SolidSolid':2,'Center':3,'TrafficLight':4,'StopSign':5}
+ self.MAPCLASSES = list(self.map_element_class.keys())
+ self.NUM_MAPCLASSES = len(self.MAPCLASSES)
+ self.map_eval_use_same_gt_sample_num_flag = True
+ self.map_ann_file = 'data/infos'
+ self.eval_cfg = eval_cfg
+ with open(self.map_file,'rb') as f:
+ self.map_infos = pickle.load(f)
+
+ def invert_pose(self, pose):
+ inv_pose = np.eye(4)
+ inv_pose[:3, :3] = np.transpose(pose[:3, :3])
+ inv_pose[:3, -1] = - inv_pose[:3, :3] @ pose[:3, -1]
+ return inv_pose
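+
+    # Descriptive note: for a rigid pose [R | t; 0 1] this returns the closed-form inverse
+    # [R^T | -R^T t; 0 1], avoiding a general np.linalg.inv; e.g. invert_pose(cam2ego)
+    # yields ego2cam, as used in get_data_info below.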
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ queue = []
+ index_list = list(range(index-self.queue_length*self.sample_interval, index,self.sample_interval))
+ random.shuffle(index_list)
+ index_list = sorted(index_list[1:])
+ index_list.append(index)
+ for i in index_list:
+ i = max(0, i)
+ input_dict = self.get_data_info(i)
+ if input_dict is None:
+ return None
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ gt_labels,gt_bboxes = self.get_map_info(index)
+ example['map_gt_labels_3d'] = DC(gt_labels, cpu_only=False)
+ example['map_gt_bboxes_3d'] = DC(gt_bboxes, cpu_only=True)
+
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ queue.append(example)
+ return self.union2one(queue)
+
+
+ def union2one(self, queue):
+ imgs_list = [each['img'].data for each in queue]
+ metas_map = {}
+ prev_scene_token = None
+ prev_pos = None
+ prev_angle = None
+ for i, each in enumerate(queue):
+ metas_map[i] = each['img_metas'].data
+ if metas_map[i]['folder'] != prev_scene_token:
+ metas_map[i]['prev_bev_exists'] = False
+ prev_scene_token = metas_map[i]['folder']
+ prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] = 0
+ metas_map[i]['can_bus'][-1] = 0
+ else:
+ metas_map[i]['prev_bev_exists'] = True
+ tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] -= prev_pos
+ metas_map[i]['can_bus'][-1] -= prev_angle
+ prev_pos = copy.deepcopy(tmp_pos)
+ prev_angle = copy.deepcopy(tmp_angle)
+ queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True)
+ queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[-1]
+ return queue
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+
+ for i in range(len(info['gt_names'])):
+ if info['gt_names'][i] in self.NameMapping.keys():
+ info['gt_names'][i] = self.NameMapping[info['gt_names'][i]]
+
+ input_dict = dict(
+ folder=info['folder'],
+ scene_token=info['folder'],
+ frame_idx=info['frame_idx'],
+ ego_yaw=np.nan_to_num(info['ego_yaw'],nan=np.pi/2),
+ ego_translation=info['ego_translation'],
+ sensors=info['sensors'],
+ world2lidar=info['sensors']['LIDAR_TOP']['world2lidar'],
+ gt_ids=info['gt_ids'],
+ gt_boxes=info['gt_boxes'],
+ gt_names=info['gt_names'],
+ ego_vel = info['ego_vel'],
+ ego_accel = info['ego_accel'],
+ ego_rotation_rate = info['ego_rotation_rate'],
+ npc2world = info['npc2world'],
+ timestamp=info['frame_idx']/10
+ )
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego']
+ lidar2global = self.invert_pose(info['sensors']['LIDAR_TOP']['world2lidar'])
+ for sensor_type, cam_info in info['sensors'].items():
+ if not 'CAM' in sensor_type:
+ continue
+ image_paths.append(osp.join(self.data_root,cam_info['data_path']))
+ # obtain lidar to image transformation matrix
+ cam2ego = cam_info['cam2ego']
+ intrinsic = cam_info['intrinsic']
+ intrinsic_pad = np.eye(4)
+ intrinsic_pad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2cam = self.invert_pose(cam2ego) @ lidar2ego
+ lidar2img = intrinsic_pad @ lidar2cam
+ lidar2img_rts.append(lidar2img)
+ cam_intrinsics.append(intrinsic_pad)
+ lidar2cam_rts.append(lidar2cam)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam_intrinsic=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ l2g_r_mat=lidar2global[0:3,0:3],
+ l2g_t=lidar2global[0:3,3]
+
+ ))
+
+ #if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+ yaw = input_dict['ego_yaw']
+ rotation = list(Quaternion(axis=[0, 0, 1], radians=yaw))
+
+ if yaw < 0:
+ yaw += 2*np.pi
+ yaw_in_degree = yaw / np.pi * 180
+
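+        # Layout of the 18-dim can_bus vector filled below:
+        # [0:3] ego translation, [3:7] orientation quaternion, [7:10] velocity,
+        # [10:13] acceleration, [13:16] rotation rate, [16] yaw (rad), [17] yaw (deg).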
+ can_bus = np.zeros(18)
+ can_bus[:3] = input_dict['ego_translation']
+ can_bus[3:7] = rotation
+ can_bus[7:10] = input_dict['ego_vel']
+ can_bus[10:13] = input_dict['ego_accel']
+ can_bus[13:16] = input_dict['ego_rotation_rate']
+ can_bus[16] = yaw
+ can_bus[17] = yaw_in_degree
+ input_dict['can_bus'] = can_bus
+ ego_lcf_feat = np.zeros(9)
+ ego_lcf_feat[0:2] = input_dict['ego_translation'][0:2]
+ ego_lcf_feat[2:4] = input_dict['ego_accel'][2:4]
+ ego_lcf_feat[4] = input_dict['ego_rotation_rate'][-1]
+ ego_lcf_feat[5] = info['ego_size'][1]
+ ego_lcf_feat[6] = info['ego_size'][0]
+ ego_lcf_feat[7] = np.sqrt(input_dict['ego_translation'][0]**2+input_dict['ego_translation'][1]**2)
+ ego_lcf_feat[8] = info['steer']
+ ego_his_trajs, ego_fut_trajs, ego_fut_masks, command = self.get_ego_trajs(index,self.sample_interval,self.past_frames,self.future_frames)
+ input_dict['ego_his_trajs'] = ego_his_trajs
+ input_dict['ego_fut_trajs'] = ego_fut_trajs
+ input_dict['ego_fut_masks'] = ego_fut_masks
+ input_dict['ego_fut_cmd'] = command
+ input_dict['ego_lcf_feat'] = ego_lcf_feat
+ input_dict['fut_valid_flag'] = (ego_fut_masks==1).all()
+
+ return input_dict
+
+
+ def get_map_info(self, index):
+
+ gt_masks = []
+ gt_labels = []
+ gt_bboxes = []
+
+ ann_info = self.data_infos[index]
+ town_name = ann_info['town_name']
+ map_info = self.map_infos[town_name]
+ lane_points = map_info['lane_points']
+ lane_sample_points = map_info['lane_sample_points']
+ lane_types = map_info['lane_types']
+ trigger_volumes_points = map_info['trigger_volumes_points']
+ trigger_volumes_sample_points = map_info['trigger_volumes_sample_points']
+ trigger_volumes_types = map_info['trigger_volumes_types']
+ world2lidar = np.array(ann_info['sensors']['LIDAR_TOP']['world2lidar'])
+ ego_xy = np.linalg.inv(world2lidar)[0:2,3]
+ max_distance = 50
+ chosed_idx = []
+
+ for idx in range(len(lane_sample_points)):
+ single_sample_points = lane_sample_points[idx]
+ distance = np.linalg.norm((single_sample_points[:,0:2]-ego_xy),axis=-1)
+ if np.min(distance) < max_distance:
+ chosed_idx.append(idx)
+
+ polylines = []
+ for idx in chosed_idx:
+ if not lane_types[idx] in self.map_element_class.keys():
+ continue
+ points = lane_points[idx]
+ points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1)
+ points_in_lidar = (world2lidar @ points.T).T
+            mask = (points_in_lidar[:,0]>self.point_cloud_range[0]) & (points_in_lidar[:,0]<self.point_cloud_range[3]) & (points_in_lidar[:,1]>self.point_cloud_range[1]) & (points_in_lidar[:,1]<self.point_cloud_range[4])
+            points_in_lidar_range = points_in_lidar[mask,0:2]
+            if points_in_lidar_range.shape[0] > 1:
+ polylines.append(LineString(points_in_lidar_range))
+ gt_label = self.map_element_class[lane_types[idx]]
+ gt_labels.append(gt_label)
+
+ for idx in range(len(trigger_volumes_points)):
+ if not trigger_volumes_types[idx] in self.map_element_class.keys():
+ continue
+ points = trigger_volumes_points[idx]
+ points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1)
+ points_in_lidar = (world2lidar @ points.T).T
+            mask = (points_in_lidar[:,0]>self.point_cloud_range[0]) & (points_in_lidar[:,0]<self.point_cloud_range[3]) & (points_in_lidar[:,1]>self.point_cloud_range[1]) & (points_in_lidar[:,1]<self.point_cloud_range[4])
+            # ... (intervening lines missing: the rest of get_map_info and the start of
+            # get_ego_trajs; only the index bound check below is recoverable) ...
+            if adj_idx < 0 or adj_idx >= len(self.data_infos):
+ break
+ adj_frame = self.data_infos[adj_idx]
+ if adj_frame['folder'] != cur_frame ['folder']:
+ break
+ world2lidar_ego_adj = adj_frame['sensors']['LIDAR_TOP']['world2lidar']
+ adj2cur_lidar = world2lidar_lidar_cur @ np.linalg.inv(world2lidar_ego_adj)
+ xy = adj2cur_lidar[0:2,3]
+ full_adj_track[j,0:2] = xy
+ full_adj_adj_mask[j] = 1
+ offset_track = full_adj_track[1:] - full_adj_track[:-1]
+ for j in range(past_frames-1,-1,-1):
+ if full_adj_adj_mask[j] == 0:
+ offset_track[j] = offset_track[j+1]
+ for j in range(past_frames,past_frames+future_frames,1):
+
+ if full_adj_adj_mask[j+1] == 0 :
+ offset_track[j] = 0
+ command = self.command2hot(cur_frame['command_near'])
+ return offset_track[:past_frames].copy(), offset_track[past_frames:].copy(), full_adj_adj_mask[-future_frames:].copy(), command
+
+ def command2hot(self,command,max_dim=6):
+ if command < 0:
+ command = 4
+ command -= 1
+ cmd_one_hot = np.zeros(max_dim)
+ cmd_one_hot[command] = 1
+ return cmd_one_hot
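+
+    # Worked example: the function subtracts 1 from the incoming command, so
+    # command2hot(1) sets index 0, while an invalid negative command is remapped to 4,
+    # i.e. command2hot(-1) -> [0., 0., 0., 1., 0., 0.].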
+
+ def get_box_attr_labels(self,idx,sample_rate,frames):
+
+
+ adj_idx_list = range(idx,idx+(frames+1)*sample_rate,sample_rate)
+ cur_frame = self.data_infos[idx]
+ cur_box_names = cur_frame['gt_names']
+ for i in range(len(cur_box_names)):
+ if cur_box_names[i] in self.NameMapping.keys():
+ cur_box_names[i] = self.NameMapping[cur_box_names[i]]
+ cur_boxes = cur_frame['gt_boxes'].copy()
+ box_ids = cur_frame['gt_ids']
+ future_track = np.zeros((len(box_ids),frames+1,2))
+ future_mask = np.zeros((len(box_ids),frames+1))
+ future_yaw = np.zeros((len(box_ids),frames+1))
+ gt_fut_goal = np.zeros((len(box_ids),1))
+ agent_lcf_feat = np.zeros((len(box_ids),9))
+ world2lidar_lidar_cur = cur_frame['sensors']['LIDAR_TOP']['world2lidar']
+ for i in range(len(box_ids)):
+ agent_lcf_feat[i,0:2] = cur_boxes[i,0:2]
+ agent_lcf_feat[i,2] = cur_boxes[i,6]
+ agent_lcf_feat[i,3:5] = cur_boxes[i,7:]
+ agent_lcf_feat[i,5:8] = cur_boxes[i,3:6]
+ cur_box_name = cur_box_names[i]
+ if cur_box_name in self.CLASSES:
+ agent_lcf_feat[i, 8] = self.CLASSES.index(cur_box_name)
+ else:
+ agent_lcf_feat[i, 8] = -1
+
+ box_id = box_ids[i]
+ cur_box2lidar = world2lidar_lidar_cur @ cur_frame['npc2world'][i]
+ cur_xy = cur_box2lidar[0:2,3]
+ for j in range(len(adj_idx_list)):
+ adj_idx = adj_idx_list[j]
+ if adj_idx <0 or adj_idx>=len(self.data_infos):
+ break
+ adj_frame = self.data_infos[adj_idx]
+ if adj_frame['folder'] != cur_frame ['folder']:
+ break
+ if len(np.where(adj_frame['gt_ids']==box_id)[0])==0:
+ continue
+ assert len(np.where(adj_frame['gt_ids']==box_id)[0]) == 1 , np.where(adj_frame['gt_ids']==box_id)[0]
+ adj_idx = np.where(adj_frame['gt_ids']==box_id)[0][0]
+ adj_box2lidar = world2lidar_lidar_cur @ adj_frame['npc2world'][adj_idx]
+ adj_xy = adj_box2lidar[0:2,3]
+ future_track[i,j,:] = adj_xy
+ future_mask[i,j] = 1
+ future_yaw[i,j] = np.arctan2(adj_box2lidar[1,0],adj_box2lidar[0,0])
+
+ coord_diff = future_track[i,-1] - future_track[i,0]
+ if coord_diff.max() < 1.0: # static
+ gt_fut_goal[i] = 9
+ else:
+ box_mot_yaw = np.arctan2(coord_diff[1], coord_diff[0]) + np.pi
+ gt_fut_goal[i] = box_mot_yaw // (np.pi / 4) # 0-8: goal direction class
+
+ future_track_offset = future_track[:,1:,:] - future_track[:,:-1,:]
+ future_mask_offset = future_mask[:,1:]
+ future_track_offset[future_mask_offset==0] = 0
+ future_yaw_offset = future_yaw[:,1:] - future_yaw[:,:-1]
+ mask1 = np.where(future_yaw_offset>np.pi)
+ mask2 = np.where(future_yaw_offset<-np.pi)
+ future_yaw_offset[mask1] -=np.pi*2
+ future_yaw_offset[mask2] +=np.pi*2
+ attr_labels = np.concatenate([future_track_offset.reshape(-1,frames*2), future_mask_offset, gt_fut_goal, agent_lcf_feat, future_yaw_offset],axis=-1).astype(np.float32)
+ return attr_labels.copy()
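+
+    # Column layout of the returned attr_labels, per agent (with f = frames):
+    #   [0 : 2f]          flattened future xy offsets
+    #   [2f : 3f]         per-frame future validity mask
+    #   [3f]              goal-direction class (0-8; 9 = static)
+    #   [3f+1 : 3f+10]    agent_lcf_feat (xy, yaw, velocity, size, class index)
+    #   [3f+10 : 4f+10]   future yaw offsets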
+
+
+
+ def load_gt(self):
+ all_annotations = EvalBoxes()
+ for i in range(len(self.data_infos)):
+ sample_boxes = []
+ sample_data = self.data_infos[i]
+ gt_boxes = sample_data['gt_boxes']
+ for j in range(gt_boxes.shape[0]):
+ class_name = self.NameMapping[sample_data['gt_names'][j]]
+ if not class_name in self.eval_cfg['class_range'].keys():
+ continue
+ range_x, range_y = self.eval_cfg['class_range'][class_name]
+ if abs(gt_boxes[j,0]) > range_x or abs(gt_boxes[j,1]) > range_y:
+ continue
+ sample_boxes.append(DetectionBox(
+ sample_token=sample_data['folder']+'_'+str(sample_data['frame_idx']),
+ translation=gt_boxes[j,0:3],
+ size=gt_boxes[j,3:6],
+ rotation=list(Quaternion(axis=[0, 0, 1], radians=-gt_boxes[j,6]-np.pi/2)),
+ velocity=gt_boxes[j,7:9],
+ num_pts=int(sample_data['num_points'][j]),
+ detection_name=class_name,
+ detection_score=-1.0,
+ attribute_name=class_name
+ ))
+ all_annotations.add_boxes(sample_data['folder']+'_'+str(sample_data['frame_idx']), sample_boxes)
+ return all_annotations
+
+
+
+ def _format_gt(self):
+ gt_annos = []
+ print('Start to convert gt map format...')
+ # assert self.map_ann_file is not None
+ if (not os.path.exists(self.map_ann_file)) :
+ dataset_length = len(self)
+ prog_bar = mmcv.ProgressBar(dataset_length)
+ mapped_class_names = self.MAPCLASSES
+ for sample_id in range(dataset_length):
+ sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx'])
+ gt_anno = {}
+ gt_anno['sample_token'] = sample_token
+ # gt_sample_annos = []
+ gt_sample_dict = {}
+ gt_labels , gt_bboxes = self.get_map_info(sample_id)
+ gt_vecs = gt_bboxes.instance_list
+ gt_vec_list = []
+ for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)):
+ name = mapped_class_names[gt_label]
+ anno = dict(
+ pts=np.array(list(gt_vec.coords)),
+ pts_num=len(list(gt_vec.coords)),
+ cls_name=name,
+ type=gt_label,
+ )
+ gt_vec_list.append(anno)
+ gt_anno['vectors']=gt_vec_list
+ gt_annos.append(gt_anno)
+
+ prog_bar.update()
+ nusc_submissions = {
+ 'GTs': gt_annos
+ }
+            print('\nGT annotations written to', self.map_ann_file)
+ dump(nusc_submissions, self.map_ann_file)
+ else:
+            print(f'{self.map_ann_file} exists, skip updating')
+
+
+ def _format_bbox(self, results, jsonfile_prefix=None, score_thresh=0.2):
+ """Convert the results to the standard format.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+
+ Returns:
+ str: Path of the output json file.
+ """
+
+ nusc_annos = {}
+ det_mapped_class_names = self.CLASSES
+ # assert self.map_ann_file is not None
+ map_pred_annos = {}
+ map_mapped_class_names = self.MAPCLASSES
+ plan_annos = {}
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(track_iter_progress(results)):
+ #pdb.set_trace()
+ annos = []
+ box3d = det['boxes_3d']
+ scores = det['scores_3d']
+ labels = det['labels_3d']
+ box_gravity_center = box3d.gravity_center
+ box_dims = box3d.dims
+ box_yaw = box3d.yaw.numpy()
+ box_yaw = -box_yaw - np.pi / 2
+ sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx'])
+ for i in range(len(box3d)):
+ #import pdb;pdb.set_trace()
+ if scores[i] < score_thresh:
+ continue
+ quat = list(Quaternion(axis=[0, 0, 1], radians=box_yaw[i]))
+ velocity = [box3d.tensor[i, 7].item(),box3d.tensor[i, 8].item()]
+ name = det_mapped_class_names[labels[i]]
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box_gravity_center[i].tolist(),
+ size=box_dims[i].tolist(),
+ rotation=quat,
+ velocity=velocity,
+ detection_name=name,
+ detection_score=scores[i].item(),
+ attribute_name=name)
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+ map_pred_anno = {}
+ vecs = output_to_vecs(det)
+ sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx'])
+ map_pred_anno['sample_token'] = sample_token
+ pred_vec_list=[]
+ for i, vec in enumerate(vecs):
+ name = map_mapped_class_names[vec['label']]
+ anno = dict(
+ # sample_token=sample_token,
+ pts=vec['pts'],
+ pts_num=len(vec['pts']),
+ cls_name=name,
+ type=vec['label'],
+ confidence_level=vec['score'])
+ pred_vec_list.append(anno)
+ # annos.append(nusc_anno)
+ # nusc_annos[sample_token] = annos
+ map_pred_anno['vectors'] = pred_vec_list
+ map_pred_annos[sample_token] = map_pred_anno
+
+        # NOTE: Map evaluation is VERY SLOW the first time (about 3 hours) because loading
+        # the map ground truth is slow, so we do not evaluate the map by default.
+ # if not os.path.exists(self.map_ann_file):
+ # self._format_gt()
+ # else:
+ # print(f'{self.map_ann_file} exist, not update')
+ # with open(self.map_ann_file,'r') as f:
+ # GT_anns = json.load(f)
+ # gt_annos = GT_anns['GTs']
+
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ 'map_results': map_pred_annos,
+ 'plan_results': plan_annos
+ # 'GTs': gt_annos
+ }
+
+ mmcv.mkdir_or_exist(jsonfile_prefix)
+
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print('Results written to', res_path)
+ dump(nusc_submissions, res_path)
+ return res_path
+
+ def format_results(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+ dict containing the json filepaths, `tmp_dir` is the temporal \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ if isinstance(results, dict):
+ # print(f'results must be a list, but get dict, keys={results.keys()}')
+ # assert isinstance(results, list)
+ results = results['bbox_results']
+ assert isinstance(results, list)
+ # assert len(results) == len(self), (
+ # 'The length of results is not equal to the dataset len: {} != {}'.
+ # format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ # currently the output prediction results could be in two formats
+ # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
+ # 2. list of dict('pts_bbox' or 'img_bbox':
+ # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
+ # this is a workaround to enable evaluation of both formats on nuScenes
+ # refer to https://github.com/open-mmlab/mmdetection3d/issues/449
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+ if name == 'metric_results':
+ continue
+                print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+ return result_files, tmp_dir
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ map_metric='chamfer',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+ detail = dict()
+ with open(result_path,'r') as f:
+ result_data = json.load(f)
+ pred_boxes = EvalBoxes.deserialize(result_data['results'], DetectionBox)
+ meta = result_data['meta']
+
+
+
+ gt_boxes = self.load_gt()
+
+ metric_data_list = DetectionMetricDataList()
+ for class_name in self.eval_cfg['class_names']:
+ for dist_th in self.eval_cfg['dist_ths']:
+ md = accumulate(gt_boxes, pred_boxes, class_name, center_distance, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+ metrics = DetectionMetrics(self.eval_cfg)
+
+ for class_name in self.eval_cfg['class_names']:
+ # Compute APs.
+ for dist_th in self.eval_cfg['dist_ths']:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.eval_cfg['min_recall'], self.eval_cfg['min_precision'])
+ metrics.add_label_ap(class_name, dist_th, ap)
+
+ # Compute TP metrics.
+ for metric_name in self.eval_cfg['tp_metrics']:
+ metric_data = metric_data_list[(class_name, self.eval_cfg['dist_th_tp'])]
+ tp = calc_tp(metric_data, self.eval_cfg['min_recall'], metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ metrics_summary = metrics.serialize()
+ metrics_summary['meta'] = meta.copy()
+ print('mAP: %.4f' % (metrics_summary['mean_ap']))
+ err_name_mapping = {
+ 'trans_err': 'mATE',
+ 'scale_err': 'mASE',
+ 'orient_err': 'mAOE',
+ 'vel_err': 'mAVE',
+ }
+ for tp_name, tp_val in metrics_summary['tp_errors'].items():
+ print('%s: %.4f' % (err_name_mapping[tp_name], tp_val))
+ print('NDS: %.4f' % (metrics_summary['nd_score']))
+ #print('Eval time: %.1fs' % metrics_summary['eval_time'])
+
+ # Print per-class metrics.
+ print()
+ print('Per-class results:')
+ print('Object Class\tAP\tATE\tASE\tAOE\tAVE')
+ class_aps = metrics_summary['mean_dist_aps']
+ class_tps = metrics_summary['label_tp_errors']
+ for class_name in class_aps.keys():
+ print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
+ % (class_name, class_aps[class_name],
+ class_tps[class_name]['trans_err'],
+ class_tps[class_name]['scale_err'],
+ class_tps[class_name]['orient_err'],
+ class_tps[class_name]['vel_err']))
+
+ detail = dict()
+ metric_prefix = 'bbox_NuScenes'
+ for name in self.eval_cfg['class_names']:
+ for k, v in metrics_summary['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics_summary['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics_summary['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,self.eval_cfg['err_name_maping'][k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics_summary['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics_summary['mean_ap']
+
+
+ # from mmcv.datasets.map_utils.mean_ap import eval_map
+ # from mmcv.datasets.map_utils.mean_ap import format_res_gt_by_classes
+ # result_path = osp.abspath(result_path)
+
+ # print('Formating results & gts by classes')
+ # pred_results = load(result_path)
+ # map_results = pred_results['map_results']
+ # gt_anns = load(self.map_ann_file)
+ # map_annotations = gt_anns['GTs']
+ # cls_gens, cls_gts = format_res_gt_by_classes(result_path,
+ # map_results,
+ # map_annotations,
+ # cls_names=self.MAPCLASSES,
+ # num_pred_pts_per_instance=self.polyline_points_num,
+ # eval_use_same_gt_sample_num_flag=self.map_eval_use_same_gt_sample_num_flag,
+ # pc_range=self.point_cloud_range)
+ # map_metrics = map_metric if isinstance(map_metric, list) else [map_metric]
+ # allowed_metrics = ['chamfer', 'iou']
+ # for metric in map_metrics:
+ # if metric not in allowed_metrics:
+ # raise KeyError(f'metric {metric} is not supported')
+ # for metric in map_metrics:
+ # print('-*'*10+f'use metric:{metric}'+'-*'*10)
+ # if metric == 'chamfer':
+ # thresholds = [0.5,1.0,1.5]
+ # elif metric == 'iou':
+ # thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+ # cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES))
+ # for i, thr in enumerate(thresholds):
+ # print('-*'*10+f'threshhold:{thr}'+'-*'*10)
+ # mAP, cls_ap = eval_map(
+ # map_results,
+ # map_annotations,
+ # cls_gens,
+ # cls_gts,
+ # threshold=thr,
+ # cls_names=self.MAPCLASSES,
+ # logger=logger,
+ # num_pred_pts_per_instance=self.polyline_points_num,
+ # pc_range=self.point_cloud_range,
+ # metric=metric)
+ # for j in range(self.NUM_MAPCLASSES):
+ # cls_aps[i, j] = cls_ap[j]['ap']
+ # for i, name in enumerate(self.MAPCLASSES):
+ # print('{}: {}'.format(name, cls_aps.mean(0)[i]))
+ # detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i]
+ # print('map: {}'.format(cls_aps.mean(0).mean()))
+ # detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean()
+ # for i, name in enumerate(self.MAPCLASSES):
+ # for j, thr in enumerate(thresholds):
+ # if metric == 'chamfer':
+ # detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+ # elif metric == 'iou':
+ # if thr == 0.5 or thr == 0.75:
+ # detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+
+ return detail
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ map_metric='chamfer',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+ result_metric_names = ['EPA', 'ADE', 'FDE', 'MR']
+ motion_cls_names = ['car', 'pedestrian']
+ motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit',
+ 'fp', 'ADE', 'FDE', 'MR']
+ all_metric_dict = {}
+ for met in motion_metric_names:
+ for cls in motion_cls_names:
+ all_metric_dict[met+'_'+cls] = 0.0
+ result_dict = {}
+ for met in result_metric_names:
+ for cls in motion_cls_names:
+ result_dict[met+'_'+cls] = 0.0
+
+ alpha = 0.5
+
+ for i in range(len(results)):
+ for key in all_metric_dict.keys():
+ all_metric_dict[key] += results[i]['metric_results'][key]
+
+ for cls in motion_cls_names:
+ result_dict['EPA_'+cls] = (all_metric_dict['hit_'+cls] - \
+ alpha * all_metric_dict['fp_'+cls]) / all_metric_dict['gt_'+cls]
+ result_dict['ADE_'+cls] = all_metric_dict['ADE_'+cls] / all_metric_dict['cnt_ade_'+cls]
+ result_dict['FDE_'+cls] = all_metric_dict['FDE_'+cls] / all_metric_dict['cnt_fde_'+cls]
+ result_dict['MR_'+cls] = all_metric_dict['MR_'+cls] / all_metric_dict['cnt_fde_'+cls]
+
+ print('\n')
+ print('-------------- Motion Prediction --------------')
+ for k, v in result_dict.items():
+ print(f'{k}: {v}')
+
+ # NOTE: print planning metric
+ print('\n')
+ print('-------------- Planning --------------')
+ metric_dict = None
+ num_valid = 0
+ for res in results:
+ if res['metric_results']['fut_valid_flag']:
+ num_valid += 1
+ else:
+ continue
+ if metric_dict is None:
+ metric_dict = copy.deepcopy(res['metric_results'])
+ else:
+ for k in res['metric_results'].keys():
+ metric_dict[k] += res['metric_results'][k]
+
+ for k in metric_dict:
+ metric_dict[k] = metric_dict[k] / num_valid
+ print("{}:{}".format(k, metric_dict[k]))
+
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+ if isinstance(result_files, dict):
+ results_dict = dict()
+ for name in result_names:
+ print('Evaluating bboxes of {}'.format(name))
+ ret_dict = self._evaluate_single(result_files[name], metric=metric, map_metric=map_metric)
+ results_dict.update(ret_dict)
+ elif isinstance(result_files, str):
+ results_dict = self._evaluate_single(result_files, metric=metric, map_metric=map_metric)
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+ return results_dict
+
+def output_to_nusc_box(detection):
+ """Convert the output to the box class in the nuScenes.
+
+ Args:
+ detection (dict): Detection results.
+
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+ - labels_3d (torch.Tensor): Predicted box labels.
+
+ Returns:
+ list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+ """
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+ trajs = detection['trajs_3d'].numpy()
+
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+ # TODO: check whether this is necessary
+ # with dir_offset & dir_limit in the head
+ box_yaw = -box_yaw - np.pi / 2
+
+ box_list = []
+ for i in range(len(box3d)):
+ quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ velocity = (*box3d.tensor[i, 7:9], 0.0)
+ # velo_val = np.linalg.norm(box3d[i, 7:9])
+ # velo_ori = box3d[i, 6]
+ # velocity = (
+ # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
+ box = CustomNuscenesBox(
+ center=box_gravity_center[i],
+ size=box_dims[i],
+ orientation=quat,
+ fut_trajs=trajs[i],
+ label=labels[i],
+ score=scores[i],
+ velocity=velocity)
+ box_list.append(box)
+ return box_list
+
+
+def lidar_nusc_box_to_global(info,
+ boxes,
+ classes,
+ eval_configs,
+ eval_version='detection_cvpr_2019'):
+ """Convert the box from ego to global coordinate.
+
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ classes (list[str]): Mapped classes in the evaluation.
+ eval_configs (object): Evaluation configuration object.
+ eval_version (str): Evaluation version.
+ Default: 'detection_cvpr_2019'
+
+ Returns:
+ list: List of standard NuScenesBoxes in the global
+ coordinate.
+ """
+ box_list = []
+ for box in boxes:
+ # Move box to ego vehicle coord system
+ box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation']))
+ box.translate(np.array(info['lidar2ego_translation']))
+ # filter det in ego.
+ cls_range_x_map = eval_configs.class_range_x
+ cls_range_y_map = eval_configs.class_range_y
+ x_distance, y_distance = box.center[0], box.center[1]
+ det_range_x = cls_range_x_map[classes[box.label]]
+ det_range_y = cls_range_y_map[classes[box.label]]
+ if abs(x_distance) > det_range_x or abs(y_distance) > det_range_y:
+ continue
+ # Move box to global coord system
+ box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
+ box.translate(np.array(info['ego2global_translation']))
+ box_list.append(box)
+ return box_list
+
+def output_to_vecs(detection):
+ box3d = detection['map_boxes_3d'].numpy()
+ scores = detection['map_scores_3d'].numpy()
+ labels = detection['map_labels_3d'].numpy()
+ pts = detection['map_pts_3d'].numpy()
+
+ vec_list = []
+ # import pdb;pdb.set_trace()
+ for i in range(box3d.shape[0]):
+ vec = dict(
+ bbox = box3d[i], # xyxy
+ label=labels[i],
+ score=scores[i],
+ pts=pts[i],
+ )
+ vec_list.append(vec)
+ return vec_list
\ No newline at end of file
diff --git a/mmcv/datasets/__init__.py b/mmcv/datasets/__init__.py
new file mode 100644
index 0000000..a0093d3
--- /dev/null
+++ b/mmcv/datasets/__init__.py
@@ -0,0 +1,15 @@
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
+from .custom_3d import Custom3DDataset
+from .custom import CustomDataset
+from .nuscenes_dataset import NuScenesDataset
+from .nuscenes_e2e_dataset import NuScenesE2EDataset
+from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler
+from .utils import replace_ImageToTensor
+from .custom_nuscenes_dataset_v2 import CustomNuScenesDatasetV2
+from .custom_nuscenes_dataset import CustomNuScenesDataset
+from .dd3d_nuscenes_dataset import DD3DNuscenesDataset
+from .lyft_dataset import LyftDataset
+from .B2D_dataset import B2D_Dataset
+from .B2D_e2e_dataset import B2D_E2E_Dataset
+from .nuscenes_vad_dataset import VADCustomNuScenesDataset
+from .B2D_vad_dataset import B2D_VAD_Dataset
\ No newline at end of file
diff --git a/mmcv/datasets/api_wrappers/__init__.py b/mmcv/datasets/api_wrappers/__init__.py
new file mode 100644
index 0000000..05f95c9
--- /dev/null
+++ b/mmcv/datasets/api_wrappers/__init__.py
@@ -0,0 +1,3 @@
+from .coco_api import COCO, COCOeval
+
+__all__ = ['COCO', 'COCOeval']
diff --git a/mmcv/datasets/api_wrappers/coco_api.py b/mmcv/datasets/api_wrappers/coco_api.py
new file mode 100644
index 0000000..57077f9
--- /dev/null
+++ b/mmcv/datasets/api_wrappers/coco_api.py
@@ -0,0 +1,46 @@
+# This file adds snake-case aliases for the COCO API.
+
+import warnings
+
+import pycocotools
+from pycocotools.coco import COCO as _COCO
+from pycocotools.cocoeval import COCOeval as _COCOeval
+
+
+class COCO(_COCO):
+ """This class is almost the same as official pycocotools package.
+
+ It implements some snake case function aliases. So that the COCO class has
+ the same interface as LVIS class.
+ """
+
+ def __init__(self, annotation_file=None):
+ if getattr(pycocotools, '__version__', '0') >= '12.0.2':
+ warnings.warn(
+ 'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"', # noqa: E501
+ UserWarning)
+ super().__init__(annotation_file=annotation_file)
+ self.img_ann_map = self.imgToAnns
+ self.cat_img_map = self.catToImgs
+
+ def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None):
+ return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd)
+
+ def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]):
+ return self.getCatIds(cat_names, sup_names, cat_ids)
+
+ def get_img_ids(self, img_ids=[], cat_ids=[]):
+ return self.getImgIds(img_ids, cat_ids)
+
+ def load_anns(self, ids):
+ return self.loadAnns(ids)
+
+ def load_cats(self, ids):
+ return self.loadCats(ids)
+
+ def load_imgs(self, ids):
+ return self.loadImgs(ids)
+
+
+# just for the ease of import
+COCOeval = _COCOeval
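+
+# Example usage of the snake-case aliases (illustrative; the annotation path is a
+# placeholder):
+#
+#   coco = COCO('annotations/instances_val2017.json')
+#   img_ids = coco.get_img_ids()
+#   ann_ids = coco.get_ann_ids(img_ids=img_ids[:1])
+#   anns = coco.load_anns(ann_ids)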
diff --git a/mmcv/datasets/builder.py b/mmcv/datasets/builder.py
new file mode 100644
index 0000000..7f527d6
--- /dev/null
+++ b/mmcv/datasets/builder.py
@@ -0,0 +1,204 @@
+import copy
+import platform
+import random
+from functools import partial
+
+import numpy as np
+from mmcv.parallel import collate
+from mmcv.utils import Registry, build_from_cfg, get_dist_info
+from torch.utils.data import DataLoader
+
+# DATASETS = Registry('dataset')
+# PIPELINES = Registry('pipeline')
+# OBJECTSAMPLERS = Registry('Object sampler')
+
+from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler
+# from .dataset_wrappers import CBGSDataset, ClassBalancedDataset, ConcatDataset, RepeatDataset
+from .samplers.sampler import build_sampler
+
+if platform.system() != 'Windows':
+ # https://github.com/pytorch/pytorch/issues/973
+ import resource
+ rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+ hard_limit = rlimit[1]
+ soft_limit = min(4096, hard_limit)
+ resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
+
+DATASETS = Registry('dataset')
+PIPELINES = Registry('pipeline')
+OBJECTSAMPLERS = Registry('Object sampler')
+
+
+
+def _concat_dataset(cfg, default_args=None):
+ from .dataset_wrappers import ConcatDataset
+ ann_files = cfg['ann_file']
+ img_prefixes = cfg.get('img_prefix', None)
+ seg_prefixes = cfg.get('seg_prefix', None)
+ proposal_files = cfg.get('proposal_file', None)
+ separate_eval = cfg.get('separate_eval', True)
+
+ datasets = []
+ num_dset = len(ann_files)
+ for i in range(num_dset):
+ data_cfg = copy.deepcopy(cfg)
+ # pop 'separate_eval' since it is not a valid key for common datasets.
+ if 'separate_eval' in data_cfg:
+ data_cfg.pop('separate_eval')
+ data_cfg['ann_file'] = ann_files[i]
+ if isinstance(img_prefixes, (list, tuple)):
+ data_cfg['img_prefix'] = img_prefixes[i]
+ if isinstance(seg_prefixes, (list, tuple)):
+ data_cfg['seg_prefix'] = seg_prefixes[i]
+ if isinstance(proposal_files, (list, tuple)):
+ data_cfg['proposal_file'] = proposal_files[i]
+ datasets.append(build_dataset(data_cfg, default_args))
+
+ return ConcatDataset(datasets, separate_eval)
+
+
+
+
+def build_dataset(cfg, default_args=None):
+ from mmcv.datasets.dataset_wrappers import CBGSDataset
+ from mmcv.datasets.dataset_wrappers import (ClassBalancedDataset,
+ ConcatDataset, RepeatDataset)
+ if isinstance(cfg, (list, tuple)):
+ dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
+ elif cfg['type'] == 'ConcatDataset':
+ dataset = ConcatDataset(
+ [build_dataset(c, default_args) for c in cfg['datasets']],
+ cfg.get('separate_eval', True))
+ elif cfg['type'] == 'RepeatDataset':
+ dataset = RepeatDataset(
+ build_dataset(cfg['dataset'], default_args), cfg['times'])
+ elif cfg['type'] == 'ClassBalancedDataset':
+ dataset = ClassBalancedDataset(
+ build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
+ elif cfg['type'] == 'CBGSDataset':
+ dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args))
+ elif isinstance(cfg.get('ann_file'), (list, tuple)):
+ dataset = _concat_dataset(cfg, default_args)
+ else:
+ dataset = build_from_cfg(cfg, DATASETS, default_args)
+
+ return dataset
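+
+# Example (illustrative; the ann_file path and pipeline are placeholders) of a nested
+# config that this builder resolves recursively:
+#
+#   cfg = dict(
+#       type='CBGSDataset',
+#       dataset=dict(
+#           type='B2D_E2E_Dataset',
+#           ann_file='data/infos/b2d_infos_train.pkl',
+#           pipeline=[]))
+#   dataset = build_dataset(cfg)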
+
+
+def build_dataloader(dataset,
+ samples_per_gpu,
+ workers_per_gpu,
+ num_gpus=1,
+ dist=True,
+ shuffle=True,
+ seed=None,
+ shuffler_sampler=None,
+ nonshuffler_sampler=None,
+ **kwargs):
+ """Build PyTorch DataLoader.
+ In distributed training, each GPU/process has a dataloader.
+ In non-distributed training, there is only one dataloader for all GPUs.
+ Args:
+ dataset (Dataset): A PyTorch dataset.
+ samples_per_gpu (int): Number of training samples on each GPU, i.e.,
+ batch size of each GPU.
+ workers_per_gpu (int): How many subprocesses to use for data loading
+ for each GPU.
+ num_gpus (int): Number of GPUs. Only used in non-distributed training.
+ dist (bool): Distributed training/test or not. Default: True.
+ shuffle (bool): Whether to shuffle the data at every epoch.
+ Default: True.
+ kwargs: any keyword argument to be used to initialize DataLoader
+ Returns:
+ DataLoader: A PyTorch dataloader.
+ """
+ rank, world_size = get_dist_info()
+ if dist:
+ # DistributedGroupSampler will definitely shuffle the data to satisfy
+ # that images on each GPU are in the same group
+ if shuffle:
+ sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'),
+ dict(
+ dataset=dataset,
+ samples_per_gpu=samples_per_gpu,
+ num_replicas=world_size,
+ rank=rank,
+ seed=seed)
+ )
+
+ else:
+ sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'),
+ dict(
+ dataset=dataset,
+ num_replicas=world_size,
+ rank=rank,
+ shuffle=shuffle,
+ seed=seed)
+ )
+
+ batch_size = samples_per_gpu
+ num_workers = workers_per_gpu
+ else:
+ # assert False, 'not support in bevformer'
+        print('WARNING: the non-distributed dataloader should only be used to measure inference speed!')
+ sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
+ batch_size = num_gpus * samples_per_gpu
+ num_workers = num_gpus * workers_per_gpu
+
+ init_fn = partial(
+ worker_init_fn, num_workers=num_workers, rank=rank,
+ seed=seed) if seed is not None else None
+ data_loader = DataLoader(
+ dataset,
+ batch_size=batch_size,
+ sampler=sampler,
+ num_workers=num_workers,
+ collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
+ pin_memory=False,
+ worker_init_fn=init_fn,
+ **kwargs)
+
+ return data_loader
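+
+# Example call (illustrative): the sampler dicts are optional overrides for the
+# registered sampler types used in distributed training.
+#
+#   loader = build_dataloader(
+#       dataset,
+#       samples_per_gpu=1,
+#       workers_per_gpu=4,
+#       dist=True,
+#       shuffle=True,
+#       seed=0,
+#       shuffler_sampler=dict(type='DistributedGroupSampler'),
+#       nonshuffler_sampler=dict(type='DistributedSampler'))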
+
+
+def worker_init_fn(worker_id, num_workers, rank, seed):
+ # The seed of each worker equals to
+ # num_worker * rank + worker_id + user_seed
+ worker_seed = num_workers * rank + worker_id + seed
+ np.random.seed(worker_seed)
+ random.seed(worker_seed)
+
+
+if platform.system() != 'Windows':
+ # https://github.com/pytorch/pytorch/issues/973
+ import resource
+ rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+ base_soft_limit = rlimit[0]
+ hard_limit = rlimit[1]
+ soft_limit = min(max(4096, base_soft_limit), hard_limit)
+ resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
+
+def custom_build_dataset(cfg, default_args=None):
+ from mmdet3d.datasets.dataset_wrappers import CBGSDataset
+ from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
+ ConcatDataset, RepeatDataset)
+ if isinstance(cfg, (list, tuple)):
+ dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg])
+ elif cfg['type'] == 'ConcatDataset':
+ dataset = ConcatDataset(
+ [custom_build_dataset(c, default_args) for c in cfg['datasets']],
+ cfg.get('separate_eval', True))
+ elif cfg['type'] == 'RepeatDataset':
+ dataset = RepeatDataset(
+ custom_build_dataset(cfg['dataset'], default_args), cfg['times'])
+ elif cfg['type'] == 'ClassBalancedDataset':
+ dataset = ClassBalancedDataset(
+ custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
+ elif cfg['type'] == 'CBGSDataset':
+ dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args))
+ elif isinstance(cfg.get('ann_file'), (list, tuple)):
+ dataset = _concat_dataset(cfg, default_args)
+ else:
+ dataset = build_from_cfg(cfg, DATASETS, default_args)
+
+ return dataset
\ No newline at end of file
diff --git a/mmcv/datasets/coco.py b/mmcv/datasets/coco.py
new file mode 100644
index 0000000..6d2f0c1
--- /dev/null
+++ b/mmcv/datasets/coco.py
@@ -0,0 +1,558 @@
+import itertools
+import logging
+import os.path as osp
+import tempfile
+import warnings
+from collections import OrderedDict
+
+from mmcv.fileio.io import load, dump
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+
+from mmcv.core import eval_recalls
+from .api_wrappers import COCO, COCOeval
+from .builder import DATASETS
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class CocoDataset(CustomDataset):
+
+ CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+ 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+ 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
+ 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
+ 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+ 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
+ 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+ 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+ 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+ 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+ 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
+ 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+ 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
+ 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
+
+ def load_annotations(self, ann_file):
+ """Load annotation from COCO style annotation file.
+
+ Args:
+ ann_file (str): Path of annotation file.
+
+ Returns:
+ list[dict]: Annotation info from COCO api.
+ """
+
+ self.coco = COCO(ann_file)
+ # The order of returned `cat_ids` will not
+ # change with the order of the CLASSES
+ self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
+
+ self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+ self.img_ids = self.coco.get_img_ids()
+ data_infos = []
+ total_ann_ids = []
+ for i in self.img_ids:
+ info = self.coco.load_imgs([i])[0]
+ info['filename'] = info['file_name']
+ data_infos.append(info)
+ ann_ids = self.coco.get_ann_ids(img_ids=[i])
+ total_ann_ids.extend(ann_ids)
+ assert len(set(total_ann_ids)) == len(
+ total_ann_ids), f"Annotation ids in '{ann_file}' are not unique!"
+ return data_infos
+
+ def get_ann_info(self, idx):
+ """Get COCO annotation by index.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ dict: Annotation info of specified index.
+ """
+
+ img_id = self.data_infos[idx]['id']
+ ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+ ann_info = self.coco.load_anns(ann_ids)
+ return self._parse_ann_info(self.data_infos[idx], ann_info)
+
+ def get_cat_ids(self, idx):
+ """Get COCO category ids by index.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ list[int]: All categories in the image of specified index.
+ """
+
+ img_id = self.data_infos[idx]['id']
+ ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
+ ann_info = self.coco.load_anns(ann_ids)
+ return [ann['category_id'] for ann in ann_info]
+
+ def _filter_imgs(self, min_size=32):
+ """Filter images too small or without ground truths."""
+ valid_inds = []
+ # obtain images that contain annotation
+ ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
+ # obtain images that contain annotations of the required categories
+ ids_in_cat = set()
+ for i, class_id in enumerate(self.cat_ids):
+ ids_in_cat |= set(self.coco.cat_img_map[class_id])
+ # merge the image id sets of the two conditions and use the merged set
+ # to filter out images if self.filter_empty_gt=True
+ ids_in_cat &= ids_with_ann
+
+ valid_img_ids = []
+ for i, img_info in enumerate(self.data_infos):
+ img_id = self.img_ids[i]
+ if self.filter_empty_gt and img_id not in ids_in_cat:
+ continue
+ if min(img_info['width'], img_info['height']) >= min_size:
+ valid_inds.append(i)
+ valid_img_ids.append(img_id)
+ self.img_ids = valid_img_ids
+ return valid_inds
+
+ def _parse_ann_info(self, img_info, ann_info):
+ """Parse bbox and mask annotation.
+
+ Args:
+ ann_info (list[dict]): Annotation info of an image.
+ with_mask (bool): Whether to parse mask annotations.
+
+ Returns:
+ dict: A dict containing the following keys: bboxes, bboxes_ignore,\
+ labels, masks, seg_map. "masks" are raw annotations and not \
+ decoded into binary masks.
+ """
+ gt_bboxes = []
+ gt_labels = []
+ gt_bboxes_ignore = []
+ gt_masks_ann = []
+ for i, ann in enumerate(ann_info):
+ if ann.get('ignore', False):
+ continue
+ x1, y1, w, h = ann['bbox']
+ inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+ inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+ if inter_w * inter_h == 0:
+ continue
+ if ann['area'] <= 0 or w < 1 or h < 1:
+ continue
+ if ann['category_id'] not in self.cat_ids:
+ continue
+ bbox = [x1, y1, x1 + w, y1 + h]
+ if ann.get('iscrowd', False):
+ gt_bboxes_ignore.append(bbox)
+ else:
+ gt_bboxes.append(bbox)
+ gt_labels.append(self.cat2label[ann['category_id']])
+ gt_masks_ann.append(ann.get('segmentation', None))
+
+ if gt_bboxes:
+ gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+ gt_labels = np.array(gt_labels, dtype=np.int64)
+ else:
+ gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+ gt_labels = np.array([], dtype=np.int64)
+
+ if gt_bboxes_ignore:
+ gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+ else:
+ gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+ seg_map = img_info['filename'].replace('jpg', 'png')
+
+ ann = dict(
+ bboxes=gt_bboxes,
+ labels=gt_labels,
+ bboxes_ignore=gt_bboxes_ignore,
+ masks=gt_masks_ann,
+ seg_map=seg_map)
+
+ return ann
+
+ def xyxy2xywh(self, bbox):
+ """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO
+ evaluation.
+
+ Args:
+ bbox (numpy.ndarray): The bounding boxes, shape (4, ), in
+ ``xyxy`` order.
+
+ Returns:
+ list[float]: The converted bounding boxes, in ``xywh`` order.
+ """
+
+ _bbox = bbox.tolist()
+ return [
+ _bbox[0],
+ _bbox[1],
+ _bbox[2] - _bbox[0],
+ _bbox[3] - _bbox[1],
+ ]
+
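+    # Worked example for ``xyxy2xywh`` (illustrative numbers): a box given as
+    # [x1, y1, x2, y2] = [10., 20., 50., 80.] is converted to
+    # [x, y, w, h] = [10., 20., 40., 60.] for COCO-style evaluation.
+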
+ def _proposal2json(self, results):
+ """Convert proposal results to COCO json style."""
+ json_results = []
+ for idx in range(len(self)):
+ img_id = self.img_ids[idx]
+ bboxes = results[idx]
+ for i in range(bboxes.shape[0]):
+ data = dict()
+ data['image_id'] = img_id
+ data['bbox'] = self.xyxy2xywh(bboxes[i])
+ data['score'] = float(bboxes[i][4])
+ data['category_id'] = 1
+ json_results.append(data)
+ return json_results
+
+ def _det2json(self, results):
+ """Convert detection results to COCO json style."""
+ json_results = []
+ for idx in range(len(self)):
+ img_id = self.img_ids[idx]
+ result = results[idx]
+ for label in range(len(result)):
+ bboxes = result[label]
+ for i in range(bboxes.shape[0]):
+ data = dict()
+ data['image_id'] = img_id
+ data['bbox'] = self.xyxy2xywh(bboxes[i])
+ data['score'] = float(bboxes[i][4])
+ data['category_id'] = self.cat_ids[label]
+ json_results.append(data)
+ return json_results
+
+ def _segm2json(self, results):
+ """Convert instance segmentation results to COCO json style."""
+ bbox_json_results = []
+ segm_json_results = []
+ for idx in range(len(self)):
+ img_id = self.img_ids[idx]
+ det, seg = results[idx]
+ for label in range(len(det)):
+ # bbox results
+ bboxes = det[label]
+ for i in range(bboxes.shape[0]):
+ data = dict()
+ data['image_id'] = img_id
+ data['bbox'] = self.xyxy2xywh(bboxes[i])
+ data['score'] = float(bboxes[i][4])
+ data['category_id'] = self.cat_ids[label]
+ bbox_json_results.append(data)
+
+ # segm results
+ # some detectors use different scores for bbox and mask
+ if isinstance(seg, tuple):
+ segms = seg[0][label]
+ mask_score = seg[1][label]
+ else:
+ segms = seg[label]
+ mask_score = [bbox[4] for bbox in bboxes]
+ for i in range(bboxes.shape[0]):
+ data = dict()
+ data['image_id'] = img_id
+ data['bbox'] = self.xyxy2xywh(bboxes[i])
+ data['score'] = float(mask_score[i])
+ data['category_id'] = self.cat_ids[label]
+ if isinstance(segms[i]['counts'], bytes):
+ segms[i]['counts'] = segms[i]['counts'].decode()
+ data['segmentation'] = segms[i]
+ segm_json_results.append(data)
+ return bbox_json_results, segm_json_results
+
+ def results2json(self, results, outfile_prefix):
+ """Dump the detection results to a COCO style json file.
+
+ There are 3 types of results: proposals, bbox predictions, mask
+ predictions, and they have different data types. This method will
+ automatically recognize the type, and dump them to json files.
+
+ Args:
+ results (list[list | tuple | ndarray]): Testing results of the
+ dataset.
+ outfile_prefix (str): The filename prefix of the json files. If the
+ prefix is "somepath/xxx", the json files will be named
+ "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
+ "somepath/xxx.proposal.json".
+
+ Returns:
+ dict[str: str]: Possible keys are "bbox", "segm", "proposal", and \
+ values are corresponding filenames.
+ """
+ result_files = dict()
+ if isinstance(results[0], list):
+ json_results = self._det2json(results)
+ result_files['bbox'] = f'{outfile_prefix}.bbox.json'
+ result_files['proposal'] = f'{outfile_prefix}.bbox.json'
+ dump(json_results, result_files['bbox'])
+ elif isinstance(results[0], tuple):
+ json_results = self._segm2json(results)
+ result_files['bbox'] = f'{outfile_prefix}.bbox.json'
+ result_files['proposal'] = f'{outfile_prefix}.bbox.json'
+ result_files['segm'] = f'{outfile_prefix}.segm.json'
+ dump(json_results[0], result_files['bbox'])
+ dump(json_results[1], result_files['segm'])
+ elif isinstance(results[0], np.ndarray):
+ json_results = self._proposal2json(results)
+ result_files['proposal'] = f'{outfile_prefix}.proposal.json'
+ dump(json_results, result_files['proposal'])
+ else:
+ raise TypeError('invalid type of results')
+ return result_files
+
+ def fast_eval_recall(self, results, proposal_nums, iou_thrs, logger=None):
+ gt_bboxes = []
+ for i in range(len(self.img_ids)):
+ ann_ids = self.coco.get_ann_ids(img_ids=self.img_ids[i])
+ ann_info = self.coco.load_anns(ann_ids)
+ if len(ann_info) == 0:
+ gt_bboxes.append(np.zeros((0, 4)))
+ continue
+ bboxes = []
+ for ann in ann_info:
+ if ann.get('ignore', False) or ann['iscrowd']:
+ continue
+ x1, y1, w, h = ann['bbox']
+ bboxes.append([x1, y1, x1 + w, y1 + h])
+ bboxes = np.array(bboxes, dtype=np.float32)
+ if bboxes.shape[0] == 0:
+ bboxes = np.zeros((0, 4))
+ gt_bboxes.append(bboxes)
+
+ recalls = eval_recalls(
+ gt_bboxes, results, proposal_nums, iou_thrs, logger=logger)
+ ar = recalls.mean(axis=1)
+ return ar
+
+ def format_results(self, results, jsonfile_prefix=None, **kwargs):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[tuple | numpy.ndarray]): Testing results of the
+ dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: (result_files, tmp_dir), result_files is a dict containing \
+                the json filepaths, tmp_dir is the temporary directory created \
+ for saving json files when jsonfile_prefix is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ assert len(results) == len(self), (
+ 'The length of results is not equal to the dataset len: {} != {}'.
+ format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+ result_files = self.results2json(results, jsonfile_prefix)
+ return result_files, tmp_dir
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ classwise=False,
+ proposal_nums=(100, 300, 1000),
+ iou_thrs=None,
+ metric_items=None):
+ """Evaluation in COCO protocol.
+
+ Args:
+ results (list[list | tuple]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated. Options are
+ 'bbox', 'segm', 'proposal', 'proposal_fast'.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+            classwise (bool): Whether to evaluate the AP for each class.
+ proposal_nums (Sequence[int]): Proposal number used for evaluating
+ recalls, such as recall@100, recall@1000.
+ Default: (100, 300, 1000).
+ iou_thrs (Sequence[float], optional): IoU threshold used for
+ evaluating recalls/mAPs. If set to a list, the average of all
+ IoUs will also be computed. If not specified, [0.50, 0.55,
+ 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used.
+ Default: None.
+ metric_items (list[str] | str, optional): Metric items that will
+ be returned. If not specified, ``['AR@100', 'AR@300',
+ 'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be
+ used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75',
+ 'mAP_s', 'mAP_m', 'mAP_l']`` will be used when
+ ``metric=='bbox' or metric=='segm'``.
+
+ Returns:
+ dict[str, float]: COCO style evaluation metric.
+ """
+
+ metrics = metric if isinstance(metric, list) else [metric]
+ allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+ for metric in metrics:
+ if metric not in allowed_metrics:
+ raise KeyError(f'metric {metric} is not supported')
+ if iou_thrs is None:
+ iou_thrs = np.linspace(
+ .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+ if metric_items is not None:
+ if not isinstance(metric_items, list):
+ metric_items = [metric_items]
+
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+ eval_results = OrderedDict()
+ cocoGt = self.coco
+ for metric in metrics:
+ msg = f'Evaluating {metric}...'
+ if logger is None:
+ msg = '\n' + msg
+ print_log(msg, logger=logger)
+
+ if metric == 'proposal_fast':
+ ar = self.fast_eval_recall(
+ results, proposal_nums, iou_thrs, logger='silent')
+ log_msg = []
+ for i, num in enumerate(proposal_nums):
+ eval_results[f'AR@{num}'] = ar[i]
+ log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}')
+ log_msg = ''.join(log_msg)
+ print_log(log_msg, logger=logger)
+ continue
+
+ iou_type = 'bbox' if metric == 'proposal' else metric
+ if metric not in result_files:
+ raise KeyError(f'{metric} is not in results')
+ try:
+ predictions = load(result_files[metric])
+ if iou_type == 'segm':
+ # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa
+ # When evaluating mask AP, if the results contain bbox,
+ # cocoapi will use the box area instead of the mask area
+ # for calculating the instance area. Though the overall AP
+ # is not affected, this leads to different
+ # small/medium/large mask AP results.
+ for x in predictions:
+ x.pop('bbox')
+ warnings.simplefilter('once')
+ warnings.warn(
+ 'The key "bbox" is deleted for more accurate mask AP '
+ 'of small/medium/large instances since v2.12.0. This '
+ 'does not change the overall mAP calculation.',
+ UserWarning)
+ cocoDt = cocoGt.loadRes(predictions)
+ except IndexError:
+ print_log(
+                    'The testing results of the whole dataset are empty.',
+ logger=logger,
+ level=logging.ERROR)
+ break
+
+ cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
+ cocoEval.params.catIds = self.cat_ids
+ cocoEval.params.imgIds = self.img_ids
+ cocoEval.params.maxDets = list(proposal_nums)
+ cocoEval.params.iouThrs = iou_thrs
+ # mapping of cocoEval.stats
+ coco_metric_names = {
+ 'mAP': 0,
+ 'mAP_50': 1,
+ 'mAP_75': 2,
+ 'mAP_s': 3,
+ 'mAP_m': 4,
+ 'mAP_l': 5,
+ 'AR@100': 6,
+ 'AR@300': 7,
+ 'AR@1000': 8,
+ 'AR_s@1000': 9,
+ 'AR_m@1000': 10,
+ 'AR_l@1000': 11
+ }
+ if metric_items is not None:
+ for metric_item in metric_items:
+ if metric_item not in coco_metric_names:
+ raise KeyError(
+ f'metric item {metric_item} is not supported')
+
+ if metric == 'proposal':
+ cocoEval.params.useCats = 0
+ cocoEval.evaluate()
+ cocoEval.accumulate()
+ cocoEval.summarize()
+ if metric_items is None:
+ metric_items = [
+ 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
+ 'AR_m@1000', 'AR_l@1000'
+ ]
+
+ for item in metric_items:
+ val = float(
+ f'{cocoEval.stats[coco_metric_names[item]]:.3f}')
+ eval_results[item] = val
+ else:
+ cocoEval.evaluate()
+ cocoEval.accumulate()
+ cocoEval.summarize()
+ if classwise: # Compute per-category AP
+ # Compute per-category AP
+ # from https://github.com/facebookresearch/detectron2/
+ precisions = cocoEval.eval['precision']
+ # precision: (iou, recall, cls, area range, max dets)
+ assert len(self.cat_ids) == precisions.shape[2]
+
+ results_per_category = []
+ for idx, catId in enumerate(self.cat_ids):
+ # area range index 0: all area ranges
+ # max dets index -1: typically 100 per image
+ nm = self.coco.loadCats(catId)[0]
+ precision = precisions[:, :, idx, 0, -1]
+ precision = precision[precision > -1]
+ if precision.size:
+ ap = np.mean(precision)
+ else:
+ ap = float('nan')
+ results_per_category.append(
+ (f'{nm["name"]}', f'{float(ap):0.3f}'))
+
+ num_columns = min(6, len(results_per_category) * 2)
+ results_flatten = list(
+ itertools.chain(*results_per_category))
+ headers = ['category', 'AP'] * (num_columns // 2)
+ results_2d = itertools.zip_longest(*[
+ results_flatten[i::num_columns]
+ for i in range(num_columns)
+ ])
+ table_data = [headers]
+ table_data += [result for result in results_2d]
+ table = AsciiTable(table_data)
+ print_log('\n' + table.table, logger=logger)
+
+ if metric_items is None:
+ metric_items = [
+ 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
+ ]
+
+ for metric_item in metric_items:
+ key = f'{metric}_{metric_item}'
+ val = float(
+ f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}'
+ )
+ eval_results[key] = val
+ ap = cocoEval.stats[:6]
+ eval_results[f'{metric}_mAP_copypaste'] = (
+ f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
+ f'{ap[4]:.3f} {ap[5]:.3f}')
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+ return eval_results
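+
+
+# Illustrative sketch (an assumption, not part of the class above): offline
+# COCO-style evaluation with this dataset. The annotation path, the empty
+# pipeline and the ``results`` list are placeholders.
+#
+#   >>> dataset = CocoDataset(
+#   ...     ann_file='data/coco/annotations/instances_val2017.json',
+#   ...     pipeline=[],
+#   ...     test_mode=True)
+#   >>> # ``results[i]`` is a list with one (N, 5) array of
+#   >>> # [x1, y1, x2, y2, score] rows per class for image i, in dataset order.
+#   >>> metrics = dataset.evaluate(results, metric='bbox', classwise=True)
+#   >>> metrics['bbox_mAP']  # float in [0, 1]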
diff --git a/mmcv/datasets/custom.py b/mmcv/datasets/custom.py
new file mode 100644
index 0000000..4cd8a1d
--- /dev/null
+++ b/mmcv/datasets/custom.py
@@ -0,0 +1,362 @@
+import os.path as osp
+import warnings
+from collections import OrderedDict
+
+import numpy as np
+from mmcv.utils import print_log
+from mmcv.fileio.io import load
+from mmcv.fileio.parse import list_from_file
+from terminaltables import AsciiTable
+from torch.utils.data import Dataset
+
+from mmcv.core import eval_map, eval_recalls
+from .builder import DATASETS
+from .pipelines import Compose
+
+
+@DATASETS.register_module()
+class CustomDataset(Dataset):
+ """Custom dataset for detection.
+
+ The annotation format is shown as follows. The `ann` field is optional for
+ testing.
+
+ .. code-block:: none
+
+ [
+ {
+ 'filename': 'a.jpg',
+ 'width': 1280,
+ 'height': 720,
+ 'ann': {
+ 'bboxes': (n, 4) in (x1, y1, x2, y2) order.
+ 'labels': (n, ),
+ 'bboxes_ignore': (k, 4), (optional field)
+ 'labels_ignore': (k, 4) (optional field)
+ }
+ },
+ ...
+ ]
+
+ Args:
+ ann_file (str): Annotation file path.
+ pipeline (list[dict]): Processing pipeline.
+ classes (str | Sequence[str], optional): Specify classes to load.
+ If is None, ``cls.CLASSES`` will be used. Default: None.
+ data_root (str, optional): Data root for ``ann_file``,
+ ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified.
+ test_mode (bool, optional): If set True, annotation will not be loaded.
+ filter_empty_gt (bool, optional): If set true, images without bounding
+ boxes of the dataset's classes will be filtered out. This option
+ only works when `test_mode=False`, i.e., we never filter images
+ during tests.
+ """
+
+ CLASSES = None
+
+ def __init__(self,
+ ann_file,
+ pipeline,
+ classes=None,
+ data_root=None,
+ img_prefix='',
+ seg_prefix=None,
+ proposal_file=None,
+ test_mode=False,
+ filter_empty_gt=True):
+ self.ann_file = ann_file
+ self.data_root = data_root
+ self.img_prefix = img_prefix
+ self.seg_prefix = seg_prefix
+ self.proposal_file = proposal_file
+ self.test_mode = test_mode
+ self.filter_empty_gt = filter_empty_gt
+ self.CLASSES = self.get_classes(classes)
+
+ # join paths if data_root is specified
+ if self.data_root is not None:
+ if not osp.isabs(self.ann_file):
+ self.ann_file = osp.join(self.data_root, self.ann_file)
+ if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
+ self.img_prefix = osp.join(self.data_root, self.img_prefix)
+ if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
+ self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
+ if not (self.proposal_file is None
+ or osp.isabs(self.proposal_file)):
+ self.proposal_file = osp.join(self.data_root,
+ self.proposal_file)
+ # load annotations (and proposals)
+ self.data_infos = self.load_annotations(self.ann_file)
+
+ if self.proposal_file is not None:
+ self.proposals = self.load_proposals(self.proposal_file)
+ else:
+ self.proposals = None
+
+ # filter images too small and containing no annotations
+ if not test_mode:
+ valid_inds = self._filter_imgs()
+ self.data_infos = [self.data_infos[i] for i in valid_inds]
+ if self.proposals is not None:
+ self.proposals = [self.proposals[i] for i in valid_inds]
+ # set group flag for the sampler
+ self._set_group_flag()
+
+ # processing pipeline
+ self.pipeline = Compose(pipeline)
+
+ def __len__(self):
+ """Total number of samples of data."""
+ return len(self.data_infos)
+
+ def load_annotations(self, ann_file):
+ """Load annotation from annotation file."""
+ return load(ann_file)
+
+ def load_proposals(self, proposal_file):
+ """Load proposal from proposal file."""
+ return load(proposal_file)
+
+ def get_ann_info(self, idx):
+ """Get annotation by index.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ dict: Annotation info of specified index.
+ """
+
+ return self.data_infos[idx]['ann']
+
+ def get_cat_ids(self, idx):
+ """Get category ids by index.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ list[int]: All categories in the image of specified index.
+ """
+
+        return self.data_infos[idx]['ann']['labels'].astype(np.int64).tolist()
+
+ def pre_pipeline(self, results):
+ """Prepare results dict for pipeline."""
+ results['img_prefix'] = self.img_prefix
+ results['seg_prefix'] = self.seg_prefix
+ results['proposal_file'] = self.proposal_file
+ results['bbox_fields'] = []
+ results['mask_fields'] = []
+ results['seg_fields'] = []
+
+ def _filter_imgs(self, min_size=32):
+ """Filter images too small."""
+ if self.filter_empty_gt:
+ warnings.warn(
+ 'CustomDataset does not support filtering empty gt images.')
+ valid_inds = []
+ for i, img_info in enumerate(self.data_infos):
+ if min(img_info['width'], img_info['height']) >= min_size:
+ valid_inds.append(i)
+ return valid_inds
+
+ def _set_group_flag(self):
+ """Set flag according to image aspect ratio.
+
+ Images with aspect ratio greater than 1 will be set as group 1,
+ otherwise group 0.
+ """
+ self.flag = np.zeros(len(self), dtype=np.uint8)
+ for i in range(len(self)):
+ img_info = self.data_infos[i]
+ if img_info['width'] / img_info['height'] > 1:
+ self.flag[i] = 1
+
+ def _rand_another(self, idx):
+ """Get another random index from the same group as the given index."""
+ pool = np.where(self.flag == self.flag[idx])[0]
+ return np.random.choice(pool)
+
+ def __getitem__(self, idx):
+ """Get training/test data after pipeline.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ dict: Training/test data (with annotation if `test_mode` is set \
+ True).
+ """
+
+ if self.test_mode:
+ return self.prepare_test_img(idx)
+ while True:
+ data = self.prepare_train_img(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def prepare_train_img(self, idx):
+ """Get training data and annotations after pipeline.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ dict: Training data and annotation after pipeline with new keys \
+ introduced by pipeline.
+ """
+
+ img_info = self.data_infos[idx]
+ ann_info = self.get_ann_info(idx)
+ results = dict(img_info=img_info, ann_info=ann_info)
+ if self.proposals is not None:
+ results['proposals'] = self.proposals[idx]
+ self.pre_pipeline(results)
+ return self.pipeline(results)
+
+ def prepare_test_img(self, idx):
+ """Get testing data after pipeline.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ dict: Testing data after pipeline with new keys introduced by \
+ pipeline.
+ """
+
+ img_info = self.data_infos[idx]
+ results = dict(img_info=img_info)
+ if self.proposals is not None:
+ results['proposals'] = self.proposals[idx]
+ self.pre_pipeline(results)
+ return self.pipeline(results)
+
+ @classmethod
+ def get_classes(cls, classes=None):
+ """Get class names of current dataset.
+
+ Args:
+ classes (Sequence[str] | str | None): If classes is None, use
+ default CLASSES defined by builtin dataset. If classes is a
+ string, take it as a file name. The file contains the name of
+ classes where each line contains one class name. If classes is
+ a tuple or list, override the CLASSES defined by the dataset.
+
+ Returns:
+ tuple[str] or list[str]: Names of categories of the dataset.
+ """
+ if classes is None:
+ return cls.CLASSES
+
+ if isinstance(classes, str):
+ # take it as a file path
+ class_names = list_from_file(classes)
+ elif isinstance(classes, (tuple, list)):
+ class_names = classes
+ else:
+ raise ValueError(f'Unsupported type {type(classes)} of classes.')
+
+ return class_names
+
+ def format_results(self, results, **kwargs):
+        """Placeholder to format results to dataset-specific outputs."""
+
+ def evaluate(self,
+ results,
+ metric='mAP',
+ logger=None,
+ proposal_nums=(100, 300, 1000),
+ iou_thr=0.5,
+ scale_ranges=None):
+ """Evaluate the dataset.
+
+ Args:
+ results (list): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | None | str): Logger used for printing
+ related information during evaluation. Default: None.
+ proposal_nums (Sequence[int]): Proposal number used for evaluating
+ recalls, such as recall@100, recall@1000.
+ Default: (100, 300, 1000).
+ iou_thr (float | list[float]): IoU threshold. Default: 0.5.
+ scale_ranges (list[tuple] | None): Scale ranges for evaluating mAP.
+ Default: None.
+ """
+
+ if not isinstance(metric, str):
+ assert len(metric) == 1
+ metric = metric[0]
+ allowed_metrics = ['mAP', 'recall']
+ if metric not in allowed_metrics:
+ raise KeyError(f'metric {metric} is not supported')
+ annotations = [self.get_ann_info(i) for i in range(len(self))]
+ eval_results = OrderedDict()
+ iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+ if metric == 'mAP':
+ assert isinstance(iou_thrs, list)
+ mean_aps = []
+ for iou_thr in iou_thrs:
+ print_log(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}')
+ mean_ap, _ = eval_map(
+ results,
+ annotations,
+ scale_ranges=scale_ranges,
+ iou_thr=iou_thr,
+ dataset=self.CLASSES,
+ logger=logger)
+ mean_aps.append(mean_ap)
+ eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
+ eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
+ elif metric == 'recall':
+ gt_bboxes = [ann['bboxes'] for ann in annotations]
+ recalls = eval_recalls(
+ gt_bboxes, results, proposal_nums, iou_thr, logger=logger)
+ for i, num in enumerate(proposal_nums):
+ for j, iou in enumerate(iou_thrs):
+ eval_results[f'recall@{num}@{iou}'] = recalls[i, j]
+ if recalls.shape[1] > 1:
+ ar = recalls.mean(axis=1)
+ for i, num in enumerate(proposal_nums):
+ eval_results[f'AR@{num}'] = ar[i]
+ return eval_results
+
+ def __repr__(self):
+        """Return a string describing the dataset and per-category instance counts."""
+ dataset_type = 'Test' if self.test_mode else 'Train'
+ result = (f'\n{self.__class__.__name__} {dataset_type} dataset '
+ f'with number of images {len(self)}, '
+ f'and instance counts: \n')
+ if self.CLASSES is None:
+ result += 'Category names are not provided. \n'
+ return result
+ instance_count = np.zeros(len(self.CLASSES) + 1).astype(int)
+ # count the instance number in each image
+ for idx in range(len(self)):
+ label = self.get_ann_info(idx)['labels']
+ unique, counts = np.unique(label, return_counts=True)
+ if len(unique) > 0:
+ # add the occurrence number to each class
+ instance_count[unique] += counts
+ else:
+ # background is the last index
+ instance_count[-1] += 1
+ # create a table with category count
+ table_data = [['category', 'count'] * 5]
+ row_data = []
+ for cls, count in enumerate(instance_count):
+ if cls < len(self.CLASSES):
+ row_data += [f'{cls} [{self.CLASSES[cls]}]', f'{count}']
+ else:
+ # add the background number
+ row_data += ['-1 background', f'{count}']
+ if len(row_data) == 10:
+ table_data.append(row_data)
+ row_data = []
+
+ table = AsciiTable(table_data)
+ result += table.table
+ return result
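+
+
+# Illustrative sketch (an assumption): the middle-format annotation list that
+# ``load_annotations`` is expected to return for this dataset. File names and
+# values are placeholders.
+#
+#   >>> middle_format = [dict(
+#   ...     filename='a.jpg', width=1280, height=720,
+#   ...     ann=dict(
+#   ...         bboxes=np.array([[10., 20., 50., 80.]], dtype=np.float32),
+#   ...         labels=np.array([0], dtype=np.int64)))]
+#   >>> # After dumping ``middle_format`` to e.g. 'tiny_ann.pkl':
+#   >>> # dataset = CustomDataset(ann_file='tiny_ann.pkl', pipeline=[], classes=('car',))
+#   >>> # dataset.evaluate(results, metric='mAP', iou_thr=0.5)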
diff --git a/mmcv/datasets/custom_3d.py b/mmcv/datasets/custom_3d.py
new file mode 100644
index 0000000..88c8bfb
--- /dev/null
+++ b/mmcv/datasets/custom_3d.py
@@ -0,0 +1,370 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import tempfile
+import warnings
+from os import path as osp
+from torch.utils.data import Dataset
+
+from mmcv.datasets.builder import DATASETS
+from ..core.bbox import get_box_type
+from .pipelines import Compose
+from .utils import extract_result_dict, get_loading_pipeline
+from mmcv.fileio.io import load, dump
+from mmcv.fileio.parse import list_from_file
+
+@DATASETS.register_module()
+class Custom3DDataset(Dataset):
+ """Customized 3D dataset.
+
+    This is the base dataset of the SUNRGB-D, ScanNet, nuScenes, and KITTI
+    datasets.
+
+ Args:
+ data_root (str): Path of dataset root.
+ ann_file (str): Path of annotation file.
+ pipeline (list[dict], optional): Pipeline used for data processing.
+ Defaults to None.
+ classes (tuple[str], optional): Classes used in the dataset.
+ Defaults to None.
+ modality (dict, optional): Modality to specify the sensor data used
+ as input. Defaults to None.
+        box_type_3d (str, optional): Type of 3D box of this dataset.
+            Based on `box_type_3d`, the dataset will wrap boxes in their
+            original format and then convert them to `box_type_3d`.
+            Defaults to 'LiDAR'. Available options include
+
+ - 'LiDAR': Box in LiDAR coordinates.
+ - 'Depth': Box in depth coordinates, usually for indoor dataset.
+ - 'Camera': Box in camera coordinates.
+ filter_empty_gt (bool, optional): Whether to filter empty GT.
+ Defaults to True.
+ test_mode (bool, optional): Whether the dataset is in test mode.
+ Defaults to False.
+ """
+
+ def __init__(self,
+ data_root,
+ ann_file,
+ pipeline=None,
+ classes=None,
+ modality=None,
+ box_type_3d='LiDAR',
+ filter_empty_gt=True,
+ test_mode=False):
+ super().__init__()
+ self.data_root = data_root
+ self.ann_file = ann_file
+ self.test_mode = test_mode
+ self.modality = modality
+ self.filter_empty_gt = filter_empty_gt
+ self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
+
+ self.CLASSES = self.get_classes(classes)
+ self.cat2id = {name: i for i, name in enumerate(self.CLASSES)}
+ self.data_infos = self.load_annotations(self.ann_file)
+
+ if pipeline is not None:
+ self.pipeline = Compose(pipeline)
+
+ # set group flag for the sampler
+ if not self.test_mode:
+ self._set_group_flag()
+
+ def load_annotations(self, ann_file):
+ """Load annotations from ann_file.
+
+ Args:
+ ann_file (str): Path of the annotation file.
+
+ Returns:
+ list[dict]: List of annotations.
+ """
+ return load(ann_file)
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - file_name (str): Filename of point clouds.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+ sample_idx = info['point_cloud']['lidar_idx']
+ pts_filename = osp.join(self.data_root, info['pts_path'])
+
+ input_dict = dict(
+ pts_filename=pts_filename,
+ sample_idx=sample_idx,
+ file_name=pts_filename)
+
+ if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+ if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any():
+ return None
+ return input_dict
+
+ def pre_pipeline(self, results):
+ """Initialization before data preparation.
+
+ Args:
+ results (dict): Dict before data preprocessing.
+
+ - img_fields (list): Image fields.
+ - bbox3d_fields (list): 3D bounding boxes fields.
+ - pts_mask_fields (list): Mask fields of points.
+ - pts_seg_fields (list): Mask fields of point segments.
+ - bbox_fields (list): Fields of bounding boxes.
+ - mask_fields (list): Fields of masks.
+ - seg_fields (list): Segment fields.
+ - box_type_3d (str): 3D box type.
+ - box_mode_3d (str): 3D box mode.
+ """
+ results['img_fields'] = []
+ results['bbox3d_fields'] = []
+ results['pts_mask_fields'] = []
+ results['pts_seg_fields'] = []
+ results['bbox_fields'] = []
+ results['mask_fields'] = []
+ results['seg_fields'] = []
+ results['box_type_3d'] = self.box_type_3d
+ results['box_mode_3d'] = self.box_mode_3d
+
+ def prepare_train_data(self, index):
+ """Training data preparation.
+
+ Args:
+ index (int): Index for accessing the target data.
+
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ input_dict = self.get_data_info(index)
+ if input_dict is None:
+ return None
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.filter_empty_gt and \
+ (example is None or
+ ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ return example
+
+ def prepare_test_data(self, index):
+ """Prepare data for testing.
+
+ Args:
+ index (int): Index for accessing the target data.
+
+ Returns:
+ dict: Testing data dict of the corresponding index.
+ """
+ input_dict = self.get_data_info(index)
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ return example
+
+ @classmethod
+ def get_classes(cls, classes=None):
+ """Get class names of current dataset.
+
+ Args:
+ classes (Sequence[str] | str | None): If classes is None, use
+ default CLASSES defined by builtin dataset. If classes is a
+ string, take it as a file name. The file contains the name of
+ classes where each line contains one class name. If classes is
+ a tuple or list, override the CLASSES defined by the dataset.
+
+        Returns:
+ list[str]: A list of class names.
+ """
+ if classes is None:
+ return cls.CLASSES
+
+ if isinstance(classes, str):
+ # take it as a file path
+ class_names = list_from_file(classes)
+ elif isinstance(classes, (tuple, list)):
+ class_names = classes
+ else:
+ raise ValueError(f'Unsupported type {type(classes)} of classes.')
+
+ return class_names
+
+ def format_results(self,
+ outputs,
+ pklfile_prefix=None,
+ submission_prefix=None):
+ """Format the results to pkl file.
+
+ Args:
+ outputs (list[dict]): Testing results of the dataset.
+ pklfile_prefix (str | None): The prefix of pkl files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: (outputs, tmp_dir), outputs is the detection results, \
+                tmp_dir is the temporary directory created for saving json \
+ files when ``jsonfile_prefix`` is not specified.
+ """
+ if pklfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ pklfile_prefix = osp.join(tmp_dir.name, 'results')
+ out = f'{pklfile_prefix}.pkl'
+ dump(outputs, out)
+ return outputs, tmp_dir
+
+ def evaluate(self,
+ results,
+ metric=None,
+ iou_thr=(0.25, 0.5),
+ logger=None,
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluate.
+
+ Evaluation in indoor protocol.
+
+ Args:
+ results (list[dict]): List of results.
+ metric (str | list[str]): Metrics to be evaluated.
+ iou_thr (list[float]): AP IoU thresholds.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict: Evaluation results.
+ """
+ from mmcv.core.evaluation import indoor_eval
+ assert isinstance(
+ results, list), f'Expect results to be list, got {type(results)}.'
+ assert len(results) > 0, 'Expect length of results > 0.'
+ assert len(results) == len(self.data_infos)
+ assert isinstance(
+ results[0], dict
+ ), f'Expect elements in results to be dict, got {type(results[0])}.'
+ gt_annos = [info['annos'] for info in self.data_infos]
+ label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)}
+ ret_dict = indoor_eval(
+ gt_annos,
+ results,
+ iou_thr,
+ label2cat,
+ logger=logger,
+ box_type_3d=self.box_type_3d,
+ box_mode_3d=self.box_mode_3d)
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+
+ return ret_dict
+
+ def _build_default_pipeline(self):
+ """Build the default pipeline for this dataset."""
+ raise NotImplementedError('_build_default_pipeline is not implemented '
+ f'for dataset {self.__class__.__name__}')
+
+ def _get_pipeline(self, pipeline):
+ """Get data loading pipeline in self.show/evaluate function.
+
+ Args:
+ pipeline (list[dict] | None): Input pipeline. If None is given, \
+ get from self.pipeline.
+ """
+ if pipeline is None:
+ if not hasattr(self, 'pipeline') or self.pipeline is None:
+ warnings.warn(
+ 'Use default pipeline for data loading, this may cause '
+ 'errors when data is on ceph')
+ return self._build_default_pipeline()
+ loading_pipeline = get_loading_pipeline(self.pipeline.transforms)
+ return Compose(loading_pipeline)
+ return Compose(pipeline)
+
+ def _extract_data(self, index, pipeline, key, load_annos=False):
+ """Load data using input pipeline and extract data according to key.
+
+ Args:
+ index (int): Index for accessing the target data.
+ pipeline (:obj:`Compose`): Composed data loading pipeline.
+ key (str | list[str]): One single or a list of data key.
+ load_annos (bool): Whether to load data annotations.
+ If True, need to set self.test_mode as False before loading.
+
+ Returns:
+ np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]:
+ A single or a list of loaded data.
+ """
+ assert pipeline is not None, 'data loading pipeline is not provided'
+ # when we want to load ground-truth via pipeline (e.g. bbox, seg mask)
+ # we need to set self.test_mode as False so that we have 'annos'
+ if load_annos:
+ original_test_mode = self.test_mode
+ self.test_mode = False
+ input_dict = self.get_data_info(index)
+ self.pre_pipeline(input_dict)
+ example = pipeline(input_dict)
+
+ # extract data items according to keys
+ if isinstance(key, str):
+ data = extract_result_dict(example, key)
+ else:
+ data = [extract_result_dict(example, k) for k in key]
+ if load_annos:
+ self.test_mode = original_test_mode
+
+ return data
+
+ def __len__(self):
+ """Return the length of data infos.
+
+ Returns:
+ int: Length of data infos.
+ """
+ return len(self.data_infos)
+
+ def _rand_another(self, idx):
+ """Randomly get another item with the same flag.
+
+ Returns:
+ int: Another index of item with the same flag.
+ """
+ pool = np.where(self.flag == self.flag[idx])[0]
+ return np.random.choice(pool)
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def _set_group_flag(self):
+ """Set flag according to image aspect ratio.
+
+ Images with aspect ratio greater than 1 will be set as group 1,
+ otherwise group 0. In 3D datasets, they are all the same, thus are all
+ zeros.
+ """
+ self.flag = np.zeros(len(self), dtype=np.uint8)
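+
+
+# Illustrative sketch (an assumption): overriding class names of a
+# Custom3DDataset subclass at construction time. ``MyLidarDataset`` and the
+# paths below are placeholders.
+#
+#   >>> dataset = MyLidarDataset(
+#   ...     data_root='data/my_set/', ann_file='infos_val.pkl',
+#   ...     classes=('car', 'pedestrian'))           # tuple/list overrides CLASSES
+#   >>> dataset = MyLidarDataset(
+#   ...     data_root='data/my_set/', ann_file='infos_val.pkl',
+#   ...     classes='data/my_set/classes.txt')       # text file read by list_from_file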
diff --git a/mmcv/datasets/custom_nuscenes_dataset.py b/mmcv/datasets/custom_nuscenes_dataset.py
new file mode 100644
index 0000000..17c9e5a
--- /dev/null
+++ b/mmcv/datasets/custom_nuscenes_dataset.py
@@ -0,0 +1,246 @@
+import copy
+import random
+
+import numpy as np
+import torch
+from os import path as osp
+
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+
+from mmcv.datasets import DATASETS, NuScenesDataset
+from mmcv.utils import save_tensor
+from mmcv.parallel import DataContainer as DC
+from mmcv.fileio.io import load
+from .nuscnes_eval import NuScenesEval_custom
+
+
+@DATASETS.register_module()
+class CustomNuScenesDataset(NuScenesDataset):
+ r"""NuScenes Dataset.
+
+    This dataset only adds camera intrinsics and extrinsics to the results.
+ """
+
+ def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.queue_length = queue_length
+ self.overlap_test = overlap_test
+ self.bev_size = bev_size
+
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ queue = []
+ index_list = list(range(index-self.queue_length, index))
+ random.shuffle(index_list)
+ index_list = sorted(index_list[1:])
+ index_list.append(index)
+ for i in index_list:
+ i = max(0, i)
+ input_dict = self.get_data_info(i)
+ if input_dict is None:
+ return None
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ queue.append(example)
+ return self.union2one(queue)
+
+
+ def union2one(self, queue):
+ imgs_list = [each['img'].data for each in queue]
+ metas_map = {}
+ prev_scene_token = None
+ prev_pos = None
+ prev_angle = None
+ for i, each in enumerate(queue):
+ metas_map[i] = each['img_metas'].data
+ if metas_map[i]['scene_token'] != prev_scene_token:
+ metas_map[i]['prev_bev_exists'] = False
+ prev_scene_token = metas_map[i]['scene_token']
+ prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] = 0
+ metas_map[i]['can_bus'][-1] = 0
+ else:
+ metas_map[i]['prev_bev_exists'] = True
+ tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] -= prev_pos
+ metas_map[i]['can_bus'][-1] -= prev_angle
+ prev_pos = copy.deepcopy(tmp_pos)
+ prev_angle = copy.deepcopy(tmp_angle)
+ queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True)
+ queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[-1]
+ return queue
+
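+    # Sketch of the can_bus bookkeeping above (illustrative numbers): the first
+    # frame of a scene gets ``prev_bev_exists=False`` and its absolute pose in
+    # ``can_bus`` zeroed out; later frames keep only deltas to the previous
+    # frame, e.g. positions [10, 12, 15] along one axis become [0, 2, 3].
+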
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+        # standard protocol modified from SECOND.Pytorch
+ input_dict = dict(
+ sample_idx=info['token'],
+ pts_filename=info['lidar_path'],
+ sweeps=info['sweeps'],
+ ego2global_translation=info['ego2global_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ prev_idx=info['prev'],
+ next_idx=info['next'],
+ scene_token=info['scene_token'],
+ can_bus=info['can_bus'],
+ frame_idx=info['frame_idx'],
+ timestamp=info['timestamp'] / 1e6,
+ )
+        # cams: ('CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT')
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ for cam_type, cam_info in info['cams'].items():
+ # if cam_type in ['CAM_FRONT','CAM_BACK_LEFT']:
+ # continue
+ image_paths.append(cam_info['data_path'])
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+ lidar2cam_t = cam_info[
+ 'sensor2lidar_translation'] @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
+ intrinsic = cam_info['cam_intrinsic']
+ viewpad = np.eye(4)
+ viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ lidar2img_rts.append(lidar2img_rt)
+
+ cam_intrinsics.append(viewpad)
+ lidar2cam_rts.append(lidar2cam_rt.T)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam_intrinsic=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ ))
+
+ if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+
+ rotation = Quaternion(input_dict['ego2global_rotation'])
+ translation = input_dict['ego2global_translation']
+ can_bus = input_dict['can_bus']
+ can_bus[:3] = translation
+ can_bus[3:7] = rotation
+ patch_angle = quaternion_yaw(rotation) / np.pi * 180
+ if patch_angle < 0:
+ patch_angle += 360
+ can_bus[-2] = patch_angle / 180 * np.pi
+ can_bus[-1] = patch_angle
+
+ return input_dict
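+
+    # Sketch of the projection chain built above (from the code, for reference):
+    # a homogeneous lidar point p = [x, y, z, 1]^T maps to the image plane via
+    #   uvw = lidar2img @ p            # lidar2img = viewpad @ lidar2cam_rt.T
+    #   u, v = uvw[0] / uvw[2], uvw[1] / uvw[2]
+    # where ``viewpad`` is the camera intrinsic matrix padded to 4x4.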
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+ from nuscenes import NuScenes
+ self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
+ verbose=True)
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+
+ eval_set_map = {
+ 'v1.0-mini': 'mini_val',
+ 'v1.0-trainval': 'val',
+ }
+ self.nusc_eval = NuScenesEval_custom(
+ self.nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=True,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos
+ )
+ self.nusc_eval.main(plot_examples=0, render_curves=False)
+ # record metrics
+ metrics = load(osp.join(output_dir, 'metrics_summary.json'))
+ detail = dict()
+ metric_prefix = f'{result_name}_NuScenes'
+ for name in self.CLASSES:
+ for k, v in metrics['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,
+ self.ErrNameMapping[k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+ return detail
diff --git a/mmcv/datasets/custom_nuscenes_dataset_v2.py b/mmcv/datasets/custom_nuscenes_dataset_v2.py
new file mode 100644
index 0000000..305d6b3
--- /dev/null
+++ b/mmcv/datasets/custom_nuscenes_dataset_v2.py
@@ -0,0 +1,302 @@
+import copy
+from .nuscenes_dataset import NuScenesDataset
+from .dd3d_nuscenes_dataset import DD3DNuscenesDataset
+from os import path as osp
+from mmcv.datasets import DATASETS
+import torch
+import numpy as np
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .nuscnes_eval import NuScenesEval_custom
+from mmcv.parallel import DataContainer as DC
+from collections import defaultdict, OrderedDict
+from mmcv.fileio.io import load
+
+
+@DATASETS.register_module()
+class CustomNuScenesDatasetV2(NuScenesDataset):
+ def __init__(self, frames=(),mono_cfg=None, overlap_test=False,*args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.frames = frames
+ self.queue_length = len(frames)
+ self.overlap_test = overlap_test
+ self.mono_cfg = mono_cfg
+ if not self.test_mode and mono_cfg is not None:
+ self.mono_dataset = DD3DNuscenesDataset(**mono_cfg)
+
+ def prepare_test_data(self, index):
+ """Prepare data for testing.
+
+ Args:
+ index (int): Index for accessing the target data.
+
+ Returns:
+ dict: Testing data dict of the corresponding index.
+ """
+ data_queue = OrderedDict()
+ input_dict = self.get_data_info(index)
+ cur_scene_token = input_dict['scene_token']
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ data_queue[0] = example
+
+ for frame_idx in self.frames:
+ chosen_idx = index + frame_idx
+            if frame_idx == 0 or chosen_idx < 0 or chosen_idx >= len(self.data_infos):
+ continue
+ info = self.data_infos[chosen_idx]
+ input_dict = self.prepare_input_dict(info)
+ if input_dict['scene_token'] == cur_scene_token:
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ data_queue[frame_idx] = example
+
+ data_queue = OrderedDict(sorted(data_queue.items()))
+ ret = defaultdict(list)
+ for i in range(len(data_queue[0]['img'])):
+ single_aug_data_queue = {}
+ for t in data_queue.keys():
+ single_example = {}
+ for key ,value in data_queue[t].items():
+ single_example[key] = value[i]
+ single_aug_data_queue[t] = single_example
+ single_aug_data_queue = OrderedDict(sorted(single_aug_data_queue.items()))
+ single_aug_sample = self.union2one(single_aug_data_queue)
+
+ for key, value in single_aug_sample.items():
+ ret[key].append(value)
+ return ret
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ data_queue = OrderedDict()
+ input_dict = self.get_data_info(index)
+ if input_dict is None:
+ return None
+ cur_scene_token = input_dict['scene_token']
+ # cur_frame_idx = input_dict['frame_idx']
+ ann_info = copy.deepcopy(input_dict['ann_info'])
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ data_queue[0] = example
+ aug_param = copy.deepcopy(example['aug_param']) if 'aug_param' in example else {}
+
+ # frame_idx_to_idx = self.scene_to_frame_idx_to_idx[cur_scene_token]
+ for frame_idx in self.frames:
+ chosen_idx = index + frame_idx
+            if frame_idx == 0 or chosen_idx < 0 or chosen_idx >= len(self.data_infos):
+ continue
+ info = self.data_infos[chosen_idx]
+ input_dict = self.prepare_input_dict(info)
+ if input_dict['scene_token'] == cur_scene_token:
+ input_dict['ann_info'] = copy.deepcopy(ann_info) # only for pipeline, should never be used
+ self.pre_pipeline(input_dict)
+ input_dict['aug_param'] = copy.deepcopy(aug_param)
+ example = self.pipeline(input_dict)
+ data_queue[frame_idx] = example
+
+ data_queue = OrderedDict(sorted(data_queue.items()))
+ return self.union2one(data_queue)
+
+ def union2one(self, queue: dict):
+ """
+        Convert the sample queue into a single training sample.
+ """
+ imgs_list = [each['img'].data for each in queue.values()]
+ lidar2ego = np.eye(4, dtype=np.float32)
+ lidar2ego[:3, :3] = Quaternion(queue[0]['lidar2ego_rotation']).rotation_matrix
+ lidar2ego[:3, 3] = queue[0]['lidar2ego_translation']
+
+ egocurr2global = np.eye(4, dtype=np.float32)
+ egocurr2global[:3,:3] = Quaternion(queue[0]['ego2global_rotation']).rotation_matrix
+ egocurr2global[:3,3] = queue[0]['ego2global_translation']
+ metas_map = {}
+ for i, each in queue.items():
+ metas_map[i] = each['img_metas'].data
+ metas_map[i]['timestamp'] = each['timestamp']
+ if 'aug_param' in each:
+ metas_map[i]['aug_param'] = each['aug_param']
+ if i == 0:
+ metas_map[i]['lidaradj2lidarcurr'] = None
+ else:
+ egoadj2global = np.eye(4, dtype=np.float32)
+ egoadj2global[:3,:3] = Quaternion(each['ego2global_rotation']).rotation_matrix
+ egoadj2global[:3,3] = each['ego2global_translation']
+
+ lidaradj2lidarcurr = np.linalg.inv(lidar2ego) @ np.linalg.inv(egocurr2global) @ egoadj2global @ lidar2ego
+ metas_map[i]['lidaradj2lidarcurr'] = lidaradj2lidarcurr
+ for i_cam in range(len(metas_map[i]['lidar2img'])):
+ metas_map[i]['lidar2img'][i_cam] = metas_map[i]['lidar2img'][i_cam] @ np.linalg.inv(lidaradj2lidarcurr)
+ queue[0]['img'] = DC(torch.stack(imgs_list),
+ cpu_only=False, stack=True)
+ queue[0]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[0]
+ return queue
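+
+    # Sketch of the temporal alignment above (from the code, for reference): an
+    # adjacent frame is brought into the current lidar frame via
+    #   lidaradj2lidarcurr = lidar2ego^-1 @ egocurr2global^-1 @ egoadj2global @ lidar2ego
+    # and each stored lidar2img is right-multiplied by its inverse, so projection
+    # still starts from current-frame lidar coordinates.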
+
+ def prepare_input_dict(self, info):
+        # standard protocol modified from SECOND.Pytorch
+ input_dict = dict(
+ sample_idx=info['token'],
+ pts_filename=info['lidar_path'],
+ sweeps=info['sweeps'],
+ ego2global_translation=info['ego2global_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ lidar2ego_translation=info['lidar2ego_translation'],
+ lidar2ego_rotation=info['lidar2ego_rotation'],
+ prev=info['prev'],
+ next=info['next'],
+ scene_token=info['scene_token'],
+ frame_idx=info['frame_idx'],
+ timestamp=info['timestamp'] / 1e6,
+ )
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ for cam_type, cam_info in info['cams'].items():
+ image_paths.append(cam_info['data_path'])
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+ lidar2cam_t = cam_info[
+ 'sensor2lidar_translation'] @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
+ intrinsic = cam_info['cam_intrinsic']
+ viewpad = np.eye(4)
+ viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ lidar2img_rts.append(lidar2img_rt)
+
+ cam_intrinsics.append(viewpad)
+ lidar2cam_rts.append(lidar2cam_rt.T)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam2img=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ ))
+
+ return input_dict
+
+ def filter_crowd_annotations(self, data_dict):
+ for ann in data_dict["annotations"]:
+ if ann.get("iscrowd", 0) == 0:
+ return True
+ return False
+
+ def get_data_info(self, index):
+ info = self.data_infos[index]
+ input_dict = self.prepare_input_dict(info)
+ if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+
+ if not self.test_mode and self.mono_cfg is not None:
+ if input_dict is None:
+ return None
+ info = self.data_infos[index]
+ img_ids = []
+ for cam_type, cam_info in info['cams'].items():
+ img_ids.append(cam_info['sample_data_token'])
+
+            mono_input_dict = []
+            mono_ann_index = []
+ for i, img_id in enumerate(img_ids):
+ tmp_dict = self.mono_dataset.getitem_by_datumtoken(img_id)
+ if tmp_dict is not None:
+ if self.filter_crowd_annotations(tmp_dict):
+ mono_input_dict.append(tmp_dict)
+ mono_ann_index.append(i)
+
+            # filter empty annotations
+ if len(mono_ann_index) == 0:
+ return None
+
+ mono_ann_index = DC(mono_ann_index, cpu_only=True)
+ input_dict['mono_input_dict'] = mono_input_dict
+ input_dict['mono_ann_idx'] = mono_ann_index
+ return input_dict
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+ from nuscenes import NuScenes
+ self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
+ verbose=True)
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+
+ eval_set_map = {
+ 'v1.0-mini': 'mini_val',
+ 'v1.0-trainval': 'val',
+ }
+ self.nusc_eval = NuScenesEval_custom(
+ self.nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=True,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos
+ )
+ self.nusc_eval.main(plot_examples=0, render_curves=False)
+ # record metrics
+ metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
+ detail = dict()
+ metric_prefix = f'{result_name}_NuScenes'
+ for name in self.CLASSES:
+ for k, v in metrics['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,
+ self.ErrNameMapping[k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+ return detail
\ No newline at end of file
diff --git a/mmcv/datasets/data_utils/data_utils.py b/mmcv/datasets/data_utils/data_utils.py
new file mode 100644
index 0000000..331e02f
--- /dev/null
+++ b/mmcv/datasets/data_utils/data_utils.py
@@ -0,0 +1,174 @@
+import math
+import numpy as np
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from nuscenes.utils.data_classes import Box as NuScenesBox
+import pyquaternion
+
+
+def output_to_nusc_box(detection):
+ """Convert the output to the box class in the nuScenes.
+ Args:
+ detection (dict): Detection results.
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+ - labels_3d (torch.Tensor): Predicted box labels.
+ Returns:
+ list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+ """
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+ if 'track_ids' in detection:
+ ids = detection['track_ids'].numpy()
+ else:
+ ids = np.ones_like(labels)
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+ # TODO: check whether this is necessary
+ # with dir_offset & dir_limit in the head
+ box_yaw = -box_yaw - np.pi / 2
+
+ box_list = []
+ for i in range(len(box3d)):
+ quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ velocity = (*box3d.tensor[i, 7:9], 0.0)
+ # velo_val = np.linalg.norm(box3d[i, 7:9])
+ # velo_ori = box3d[i, 6]
+ # velocity = (
+ # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
+ box = NuScenesBox(
+ box_gravity_center[i],
+ box_dims[i],
+ quat,
+ label=labels[i],
+ score=scores[i],
+ velocity=velocity)
+ box.token = ids[i]
+ box_list.append(box)
+ return box_list
+
+
+def output_to_nusc_box_det(detection):
+ """Convert the output to the box class in the nuScenes.
+
+ Args:
+ detection (dict): Detection results.
+
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+ - labels_3d (torch.Tensor): Predicted box labels.
+
+ Returns:
+ list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+ """
+ if 'boxes_3d_det' in detection:
+ box3d = detection['boxes_3d_det']
+ scores = detection['scores_3d_det'].numpy()
+ labels = detection['labels_3d_det'].numpy()
+ else:
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+ # TODO: check whether this is necessary
+ # with dir_offset & dir_limit in the head
+ box_yaw = -box_yaw - np.pi / 2
+
+ box_list = []
+ for i in range(len(box3d)):
+ quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ velocity = (*box3d.tensor[i, 7:9], 0.0)
+ box = NuScenesBox(
+ box_gravity_center[i],
+ box_dims[i],
+ quat,
+ label=labels[i],
+ score=scores[i],
+ velocity=velocity)
+ box_list.append(box)
+ return box_list
+
+
+def lidar_nusc_box_to_global(info,
+ boxes,
+ classes,
+ eval_configs,
+ eval_version='detection_cvpr_2019'):
+    """Convert boxes from the lidar frame to the global frame (via the ego frame).
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ classes (list[str]): Mapped classes in the evaluation.
+ eval_configs (object): Evaluation configuration object.
+ eval_version (str, optional): Evaluation version.
+ Default: 'detection_cvpr_2019'
+    Returns:
+        tuple: A list of NuScenesBoxes in the global frame and a list of the
+            indices of the boxes kept after per-class range filtering.
+ """
+ box_list = []
+ keep_idx = []
+ for i, box in enumerate(boxes):
+ # Move box to ego vehicle coord system
+ box.rotate(Quaternion(info['lidar2ego_rotation']))
+ box.translate(np.array(info['lidar2ego_translation']))
+ # filter det in ego.
+ cls_range_map = eval_configs.class_range
+ radius = np.linalg.norm(box.center[:2], 2)
+ det_range = cls_range_map[classes[box.label]]
+ if radius > det_range:
+ continue
+ # Move box to global coord system
+ box.rotate(Quaternion(info['ego2global_rotation']))
+ box.translate(np.array(info['ego2global_translation']))
+ box_list.append(box)
+ keep_idx.append(i)
+ return box_list, keep_idx
+
+
+def obtain_map_info(nusc,
+ nusc_maps,
+ sample,
+ patch_size=(102.4, 102.4),
+ canvas_size=(256, 256),
+ layer_names=['lane_divider', 'road_divider'],
+ thickness=10):
+ """
+    Rasterize nuScenes map layers around the ego pose into binary BEV masks
+    (drivable area and lane/road dividers).
+ """
+ l2e_r = sample['lidar2ego_rotation']
+ l2e_t = sample['lidar2ego_translation']
+ e2g_r = sample['ego2global_rotation']
+ e2g_t = sample['ego2global_translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ scene = nusc.get('scene', sample['scene_token'])
+ log = nusc.get('log', scene['log_token'])
+ nusc_map = nusc_maps[log['location']]
+ if layer_names is None:
+ layer_names = nusc_map.non_geometric_layers
+
+ l2g_r_mat = (l2e_r_mat.T @ e2g_r_mat.T).T
+ l2g_t = l2e_t @ e2g_r_mat.T + e2g_t
+ patch_box = (l2g_t[0], l2g_t[1], patch_size[0], patch_size[1])
+ patch_angle = math.degrees(Quaternion(matrix=l2g_r_mat).yaw_pitch_roll[0])
+
+ map_mask = nusc_map.get_map_mask(
+ patch_box, patch_angle, layer_names, canvas_size=canvas_size)
+ map_mask = map_mask[-2] | map_mask[-1]
+ map_mask = map_mask[np.newaxis, :]
+ map_mask = map_mask.transpose((2, 1, 0)).squeeze(2) # (H, W, C)
+
+ erode = nusc_map.get_map_mask(patch_box, patch_angle, [
+ 'drivable_area'], canvas_size=canvas_size)
+ erode = erode.transpose((2, 1, 0)).squeeze(2)
+
+ map_mask = np.concatenate([erode[None], map_mask[None]], axis=0)
+ return map_mask
diff --git a/mmcv/datasets/data_utils/rasterize.py b/mmcv/datasets/data_utils/rasterize.py
new file mode 100644
index 0000000..c30a870
--- /dev/null
+++ b/mmcv/datasets/data_utils/rasterize.py
@@ -0,0 +1,160 @@
+import cv2
+import numpy as np
+from shapely import affinity
+from shapely.geometry import LineString, box
+
+
+def get_patch_coord(patch_box, patch_angle=0.0):
+ patch_x, patch_y, patch_h, patch_w = patch_box
+
+ x_min = patch_x - patch_w / 2.0
+ y_min = patch_y - patch_h / 2.0
+ x_max = patch_x + patch_w / 2.0
+ y_max = patch_y + patch_h / 2.0
+
+ patch = box(x_min, y_min, x_max, y_max)
+ patch = affinity.rotate(patch, patch_angle, origin=(
+ patch_x, patch_y), use_radians=False)
+
+ return patch
+
+
+def get_discrete_degree(vec, angle_class=36):
+ deg = np.mod(np.degrees(np.arctan2(vec[1], vec[0])), 360)
+ deg = (int(deg / (360 / angle_class) + 0.5) % angle_class) + 1
+ return deg
+
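+# get_discrete_degree buckets a 2D direction vector into `angle_class` bins of
+# 360 / angle_class degrees, 1-indexed so that 0 can be used as background.
+# Worked example (angle_class=36): vec = (1, 1) -> 45 deg -> int(45/10 + 0.5) % 36 + 1 = 6.
+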
+
+def mask_for_lines(lines, mask, thickness, idx, type='index', angle_class=36):
+ coords = np.asarray(list(lines.coords), np.int32)
+ coords = coords.reshape((-1, 2))
+ if len(coords) < 2:
+ return mask, idx
+ if type == 'backward':
+ coords = np.flip(coords, 0)
+
+ if type == 'index':
+ cv2.polylines(mask, [coords], False, color=idx, thickness=thickness)
+ idx += 1
+ else:
+ for i in range(len(coords) - 1):
+ cv2.polylines(mask, [coords[i:]], False, color=get_discrete_degree(
+ coords[i + 1] - coords[i], angle_class=angle_class), thickness=thickness)
+ return mask, idx
+
+
+def line_geom_to_mask(layer_geom, confidence_levels, local_box, canvas_size, thickness, idx, type='index', angle_class=36):
+ patch_x, patch_y, patch_h, patch_w = local_box
+
+ patch = get_patch_coord(local_box)
+
+ canvas_h = canvas_size[0]
+ canvas_w = canvas_size[1]
+ scale_height = canvas_h / patch_h
+ scale_width = canvas_w / patch_w
+
+ trans_x = -patch_x + patch_w / 2.0
+ trans_y = -patch_y + patch_h / 2.0
+
+ map_mask = np.zeros(canvas_size, np.uint8)
+
+ for line in layer_geom:
+ if isinstance(line, tuple):
+ line, confidence = line
+ else:
+ confidence = None
+ new_line = line.intersection(patch)
+ if not new_line.is_empty:
+ new_line = affinity.affine_transform(
+ new_line, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y])
+ new_line = affinity.scale(
+ new_line, xfact=scale_width, yfact=scale_height, origin=(0, 0))
+ confidence_levels.append(confidence)
+ if new_line.geom_type == 'MultiLineString':
+                for new_single_line in new_line.geoms:
+ map_mask, idx = mask_for_lines(
+ new_single_line, map_mask, thickness, idx, type, angle_class)
+ else:
+ map_mask, idx = mask_for_lines(
+ new_line, map_mask, thickness, idx, type, angle_class)
+ return map_mask, idx
+
+
+def overlap_filter(mask, filter_mask):
+ C, _, _ = mask.shape
+ for c in range(C-1, -1, -1):
+ filter = np.repeat((filter_mask[c] != 0)[None, :], c, axis=0)
+ mask[:c][filter] = 0
+
+ return mask
+
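+# overlap_filter resolves overlaps between class channels: iterating from the
+# last class to the first, any pixel already drawn (per the dilated filter_mask)
+# in a higher-index channel is zeroed in all lower-index channels, so the
+# higher-index class wins where lines of different classes overlap.
+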
+
+def preprocess_map(vectors, patch_size, canvas_size, num_classes, thickness, angle_class):
+ confidence_levels = [-1]
+ vector_num_list = {}
+ for i in range(num_classes):
+ vector_num_list[i] = []
+
+ for vector in vectors:
+ if vector['pts_num'] >= 2:
+ vector_num_list[vector['type']].append(
+ LineString(vector['pts'][:vector['pts_num']]))
+
+ local_box = (0.0, 0.0, patch_size[0], patch_size[1])
+
+ idx = 1
+ filter_masks = []
+ instance_masks = []
+ forward_masks = []
+ backward_masks = []
+ for i in range(num_classes):
+ map_mask, idx = line_geom_to_mask(
+ vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, idx)
+ instance_masks.append(map_mask)
+ filter_mask, _ = line_geom_to_mask(
+ vector_num_list[i], confidence_levels, local_box, canvas_size, thickness + 4, 1)
+ filter_masks.append(filter_mask)
+ forward_mask, _ = line_geom_to_mask(
+ vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, 1, type='forward', angle_class=angle_class)
+ forward_masks.append(forward_mask)
+ backward_mask, _ = line_geom_to_mask(
+ vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, 1, type='backward', angle_class=angle_class)
+ backward_masks.append(backward_mask)
+
+ filter_masks = np.stack(filter_masks)
+ instance_masks = np.stack(instance_masks)
+ forward_masks = np.stack(forward_masks)
+ backward_masks = np.stack(backward_masks)
+
+ instance_masks = overlap_filter(instance_masks, filter_masks)
+ forward_masks = overlap_filter(
+ forward_masks, filter_masks).sum(0).astype('int32')
+ backward_masks = overlap_filter(
+ backward_masks, filter_masks).sum(0).astype('int32')
+
+ semantic_masks = instance_masks != 0
+
+ return semantic_masks, instance_masks, forward_masks, backward_masks
+
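+# Illustrative usage of preprocess_map (hypothetical inputs; shapes follow from
+# the code above):
+#   vectors = [{'pts': np.array([[0., 0.], [5., 5.]]), 'pts_num': 2, 'type': 0}]
+#   semantic, instance, fwd, bwd = preprocess_map(
+#       vectors, patch_size=(30., 60.), canvas_size=(200, 400),
+#       num_classes=3, thickness=5, angle_class=36)
+#   # semantic: (3, 200, 400) bool, instance: (3, 200, 400) int line ids,
+#   # fwd / bwd: (200, 400) int32 maps of discretised segment directions.
+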
+
+def rasterize_map(vectors, patch_size, canvas_size, num_classes, thickness):
+ confidence_levels = [-1]
+ vector_num_list = {}
+ for i in range(num_classes):
+ vector_num_list[i] = []
+
+ for vector in vectors:
+ if vector['pts_num'] >= 2:
+ vector_num_list[vector['type']].append(
+ (LineString(vector['pts'][:vector['pts_num']]), vector.get('confidence_level', 1)))
+
+ local_box = (0.0, 0.0, patch_size[0], patch_size[1])
+
+ idx = 1
+ masks = []
+ for i in range(num_classes):
+ map_mask, idx = line_geom_to_mask(
+ vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, idx)
+ masks.append(map_mask)
+
+ return np.stack(masks), confidence_levels
diff --git a/mmcv/datasets/data_utils/trajectory_api.py b/mmcv/datasets/data_utils/trajectory_api.py
new file mode 100644
index 0000000..83b2c3b
--- /dev/null
+++ b/mmcv/datasets/data_utils/trajectory_api.py
@@ -0,0 +1,283 @@
+import numpy as np
+from nuscenes.prediction import (PredictHelper,
+ convert_local_coords_to_global,
+ convert_global_coords_to_local)
+from mmcv.core.bbox.structures.box_3d_mode import Box3DMode
+from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from mmcv.parallel import DataContainer as DC
+from mmcv.datasets.pipelines import to_tensor
+
+class NuScenesTraj(object):
+ def __init__(self,
+ nusc,
+ predict_steps,
+ planning_steps,
+ past_steps,
+ fut_steps,
+ with_velocity,
+ CLASSES,
+ box_mode_3d,
+ use_nonlinear_optimizer=False):
+ super().__init__()
+ self.nusc = nusc
+ self.prepare_sdc_vel_info()
+ self.predict_steps = predict_steps
+ self.planning_steps = planning_steps
+ self.past_steps = past_steps
+ self.fut_steps = fut_steps
+ self.with_velocity = with_velocity
+ self.CLASSES = CLASSES
+ self.box_mode_3d = box_mode_3d
+ self.predict_helper = PredictHelper(self.nusc)
+ self.use_nonlinear_optimizer = use_nonlinear_optimizer
+
+ def get_traj_label(self, sample_token, ann_tokens):
+ sd_rec = self.nusc.get('sample', sample_token)
+ fut_traj_all = []
+ fut_traj_valid_mask_all = []
+ past_traj_all = []
+ past_traj_valid_mask_all = []
+ _, boxes, _ = self.nusc.get_sample_data(sd_rec['data']['LIDAR_TOP'], selected_anntokens=ann_tokens)
+ for i, ann_token in enumerate(ann_tokens):
+ box = boxes[i]
+ instance_token = self.nusc.get('sample_annotation', ann_token)['instance_token']
+ fut_traj_local = self.predict_helper.get_future_for_agent(instance_token, sample_token, seconds=6, in_agent_frame=True)
+ past_traj_local = self.predict_helper.get_past_for_agent(instance_token, sample_token, seconds=2, in_agent_frame=True)
+
+ fut_traj = np.zeros((self.predict_steps, 2))
+ fut_traj_valid_mask = np.zeros((self.predict_steps, 2))
+ past_traj = np.zeros((self.past_steps + self.fut_steps, 2))
+ past_traj_valid_mask = np.zeros((self.past_steps + self.fut_steps, 2))
+ if fut_traj_local.shape[0] > 0:
+ if self.use_nonlinear_optimizer:
+ trans = box.center
+ else:
+ trans = np.array([0, 0, 0])
+ rot = Quaternion(matrix=box.rotation_matrix)
+ fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot)
+ fut_traj[:fut_traj_scence_centric.shape[0], :] = fut_traj_scence_centric
+ fut_traj_valid_mask[:fut_traj_scence_centric.shape[0], :] = 1
+ if past_traj_local.shape[0] > 0:
+ trans = np.array([0, 0, 0])
+ rot = Quaternion(matrix=box.rotation_matrix)
+ past_traj_scence_centric = convert_local_coords_to_global(past_traj_local, trans, rot)
+ past_traj[:past_traj_scence_centric.shape[0], :] = past_traj_scence_centric
+ past_traj_valid_mask[:past_traj_scence_centric.shape[0], :] = 1
+
+ if fut_traj_local.shape[0] > 0:
+ fut_steps = min(self.fut_steps, fut_traj_scence_centric.shape[0])
+ past_traj[self.past_steps:self.past_steps+fut_steps, :] = fut_traj_scence_centric[:fut_steps]
+ past_traj_valid_mask[self.past_steps:self.past_steps+fut_steps, :] = 1
+
+ fut_traj_all.append(fut_traj)
+ fut_traj_valid_mask_all.append(fut_traj_valid_mask)
+ past_traj_all.append(past_traj)
+ past_traj_valid_mask_all.append(past_traj_valid_mask)
+ if len(ann_tokens) > 0:
+ fut_traj_all = np.stack(fut_traj_all, axis=0)
+ fut_traj_valid_mask_all = np.stack(fut_traj_valid_mask_all, axis=0)
+ past_traj_all = np.stack(past_traj_all, axis=0)
+ past_traj_valid_mask_all = np.stack(past_traj_valid_mask_all, axis=0)
+ else:
+ fut_traj_all = np.zeros((0, self.predict_steps, 2))
+ fut_traj_valid_mask_all = np.zeros((0, self.predict_steps, 2))
+ past_traj_all = np.zeros((0, self.predict_steps, 2))
+ past_traj_valid_mask_all = np.zeros((0, self.predict_steps, 2))
+ return fut_traj_all, fut_traj_valid_mask_all, past_traj_all, past_traj_valid_mask_all
+
+ def get_vel_transform_mats(self, sample):
+ sd_rec = self.nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ cs_record = self.nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = self.nusc.get('ego_pose', sd_rec['ego_pose_token'])
+
+ l2e_r = cs_record['rotation']
+ l2e_t = cs_record['translation']
+ e2g_r = pose_record['rotation']
+ e2g_t = pose_record['translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ return l2e_r_mat, e2g_r_mat
+
+ def get_vel_and_time(self, sample):
+ lidar_token = sample['data']['LIDAR_TOP']
+ lidar_top = self.nusc.get('sample_data', lidar_token)
+ pose = self.nusc.get('ego_pose', lidar_top['ego_pose_token'])
+ xyz = pose['translation']
+ timestamp = sample['timestamp']
+ return xyz, timestamp
+
+ def prepare_sdc_vel_info(self):
+ # generate sdc velocity info for all samples
+ # Note that these velocity values are converted from
+ # global frame to lidar frame
+ # as aligned with bbox gts
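+        # The velocity is a finite difference between consecutive keyframes
+        # (roughly 0.5 s apart in nuScenes): vel = (xyz_next - xyz) / dt with dt
+        # in seconds, then rotated global -> ego -> lidar using the inverse
+        # e2g / l2e rotations below.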
+
+ self.sdc_vel_info = {}
+ for scene in self.nusc.scene:
+ scene_token = scene['token']
+
+ # we cannot infer vel for the last sample, therefore we skip it
+ last_sample_token = scene['last_sample_token']
+ sample_token = scene['first_sample_token']
+ sample = self.nusc.get('sample', sample_token)
+ xyz, time = self.get_vel_and_time(sample)
+ while sample['token'] != last_sample_token:
+ next_sample_token = sample['next']
+ next_sample = self.nusc.get('sample', next_sample_token)
+ next_xyz, next_time = self.get_vel_and_time(next_sample)
+ dc = np.array(next_xyz) - np.array(xyz)
+ dt = (next_time - time) / 1e6
+ vel = dc/dt
+
+ # global frame to lidar frame
+ l2e_r_mat, e2g_r_mat = self.get_vel_transform_mats(sample)
+ vel = vel @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(
+ l2e_r_mat).T
+ vel = vel[:2]
+
+ self.sdc_vel_info[sample['token']] = vel
+ xyz, time = next_xyz, next_time
+ sample = next_sample
+
+        # the last sample has no successor, so reuse the second-to-last sample's velocity
+ last_sample = self.nusc.get('sample', last_sample_token)
+ second_last_sample_token = last_sample['prev']
+ self.sdc_vel_info[last_sample_token] = self.sdc_vel_info[second_last_sample_token]
+
+ def generate_sdc_info(self, sdc_vel, as_lidar_instance3d_box=False):
+ # sdc dim from https://forum.nuscenes.org/t/dimensions-of-the-ego-vehicle-used-to-gather-data/550
+ psudo_sdc_bbox = np.array([0.0, 0.0, 0.0, 1.73, 4.08, 1.56, -np.pi])
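+        # Pseudo ego box laid out as (x, y, z, w, l, h, yaw) in the lidar frame:
+        # per the forum thread above, the ego vehicle is roughly 1.73 m wide,
+        # 4.08 m long and 1.56 m tall, placed at the origin of the lidar frame.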
+ if self.with_velocity:
+ psudo_sdc_bbox = np.concatenate([psudo_sdc_bbox, sdc_vel], axis=-1)
+ gt_bboxes_3d = np.array([psudo_sdc_bbox]).astype(np.float32)
+ gt_names_3d = ['car']
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ if as_lidar_instance3d_box:
+            # return the raw LiDARInstance3DBoxes without wrapping it in a DataContainer
+ return gt_bboxes_3d
+
+ gt_labels_3d = DC(to_tensor(gt_labels_3d))
+ gt_bboxes_3d = DC(gt_bboxes_3d, cpu_only=True)
+
+ return gt_bboxes_3d, gt_labels_3d
+
+ def get_sdc_traj_label(self, sample_token):
+ sd_rec = self.nusc.get('sample', sample_token)
+ lidar_top_data_start = self.nusc.get('sample_data', sd_rec['data']['LIDAR_TOP'])
+ ego_pose_start = self.nusc.get('ego_pose', lidar_top_data_start['ego_pose_token'])
+
+ sdc_fut_traj = []
+ for _ in range(self.predict_steps):
+ next_annotation_token = sd_rec['next']
+ if next_annotation_token=='':
+ break
+ sd_rec = self.nusc.get('sample', next_annotation_token)
+ lidar_top_data_next = self.nusc.get('sample_data', sd_rec['data']['LIDAR_TOP'])
+ ego_pose_next = self.nusc.get('ego_pose', lidar_top_data_next['ego_pose_token'])
+ sdc_fut_traj.append(ego_pose_next['translation'][:2]) # global xy pos of sdc at future step i
+
+ sdc_fut_traj_all = np.zeros((1, self.predict_steps, 2))
+ sdc_fut_traj_valid_mask_all = np.zeros((1, self.predict_steps, 2))
+ n_valid_timestep = len(sdc_fut_traj)
+ if n_valid_timestep>0:
+ sdc_fut_traj = np.stack(sdc_fut_traj, axis=0) #(t,2)
+ sdc_fut_traj = convert_global_coords_to_local(
+ coordinates=sdc_fut_traj,
+ translation=ego_pose_start['translation'],
+ rotation=ego_pose_start['rotation'],
+ )
+ sdc_fut_traj_all[:,:n_valid_timestep,:] = sdc_fut_traj
+ sdc_fut_traj_valid_mask_all[:,:n_valid_timestep,:] = 1
+
+ return sdc_fut_traj_all, sdc_fut_traj_valid_mask_all
+
+ def get_l2g_transform(self, sample):
+ sd_rec = self.nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+ cs_record = self.nusc.get('calibrated_sensor',
+ sd_rec['calibrated_sensor_token'])
+ pose_record = self.nusc.get('ego_pose', sd_rec['ego_pose_token'])
+
+ l2e_r = cs_record['rotation']
+ l2e_t = np.array(cs_record['translation'])
+ e2g_r = pose_record['rotation']
+ e2g_t = np.array(pose_record['translation'])
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+ return l2e_r_mat, l2e_t, e2g_r_mat, e2g_t
+
+ def get_sdc_planning_label(self, sample_token):
+ sd_rec = self.nusc.get('sample', sample_token)
+ l2e_r_mat_init, l2e_t_init, e2g_r_mat_init, e2g_t_init = self.get_l2g_transform(sd_rec)
+
+
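+        # Each future SDC box below is chained through:
+        #   current lidar -> current ego -> global -> initial ego -> initial lidar,
+        # so the resulting planning trajectory is expressed in the lidar frame of
+        # the starting sample.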
+ planning = []
+ for _ in range(self.planning_steps):
+ next_annotation_token = sd_rec['next']
+ if next_annotation_token=='':
+ break
+ sd_rec = self.nusc.get('sample', next_annotation_token)
+ l2e_r_mat_curr, l2e_t_curr, e2g_r_mat_curr, e2g_t_curr = self.get_l2g_transform(sd_rec) # (lidar to global at current frame)
+
+ # bbox of sdc under current lidar frame
+ next_bbox3d = self.generate_sdc_info(self.sdc_vel_info[next_annotation_token], as_lidar_instance3d_box=True)
+
+ # to bbox under curr ego frame
+ next_bbox3d.rotate(l2e_r_mat_curr.T)
+ next_bbox3d.translate(l2e_t_curr)
+
+ # to bbox under world frame
+ next_bbox3d.rotate(e2g_r_mat_curr.T)
+ next_bbox3d.translate(e2g_t_curr)
+
+ # to bbox under initial ego frame, first inverse translate, then inverse rotate
+ next_bbox3d.translate(- e2g_t_init)
+ m1 = np.linalg.inv(e2g_r_mat_init)
+ next_bbox3d.rotate(m1.T)
+
+            # to bbox under the initial lidar frame, first inverse translate, then inverse rotate
+ next_bbox3d.translate(- l2e_t_init)
+ m2 = np.linalg.inv(l2e_r_mat_init)
+ next_bbox3d.rotate(m2.T)
+
+ planning.append(next_bbox3d)
+
+ planning_all = np.zeros((1, self.planning_steps, 3))
+ planning_mask_all = np.zeros((1, self.planning_steps, 2))
+ n_valid_timestep = len(planning)
+ if n_valid_timestep>0:
+ planning = [p.tensor.squeeze(0) for p in planning]
+ planning = np.stack(planning, axis=0) # (valid_t, 9)
+ planning = planning[:, [0,1,6]] # (x, y, yaw)
+ planning_all[:,:n_valid_timestep,:] = planning
+ planning_mask_all[:,:n_valid_timestep,:] = 1
+
+ mask = planning_mask_all[0].any(axis=1)
+ if mask.sum() == 0:
+ command = 2 #'FORWARD'
+ elif planning_all[0, mask][-1][0] >= 2:
+ command = 0 #'RIGHT'
+ elif planning_all[0, mask][-1][0] <= -2:
+ command = 1 #'LEFT'
+ else:
+ command = 2 #'FORWARD'
+
+ return planning_all, planning_mask_all, command
\ No newline at end of file
diff --git a/mmcv/datasets/data_utils/vector_map.py b/mmcv/datasets/data_utils/vector_map.py
new file mode 100644
index 0000000..1ea21a6
--- /dev/null
+++ b/mmcv/datasets/data_utils/vector_map.py
@@ -0,0 +1,246 @@
+import numpy as np
+from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from shapely import affinity, ops
+from shapely.geometry import LineString, box, MultiPolygon, MultiLineString
+
+CLASS2LABEL = {
+ 'road_divider': 0,
+ 'lane_divider': 0,
+ 'ped_crossing': 1,
+ 'contours': 2,
+ 'others': -1
+}
+
+class VectorizedLocalMap(object):
+ def __init__(self,
+ dataroot,
+ patch_size,
+ canvas_size,
+ line_classes=['road_divider', 'lane_divider'],
+ ped_crossing_classes=['ped_crossing'],
+ contour_classes=['road_segment', 'lane'],
+ sample_dist=1,
+ num_samples=250,
+ padding=False,
+ normalize=False,
+ fixed_num=-1):
+ '''
+ Args:
+ fixed_num = -1 : no fixed num
+ '''
+ super().__init__()
+ self.data_root = dataroot
+ self.MAPS = ['boston-seaport', 'singapore-hollandvillage',
+ 'singapore-onenorth', 'singapore-queenstown']
+ self.line_classes = line_classes
+ self.ped_crossing_classes = ped_crossing_classes
+ self.polygon_classes = contour_classes
+ self.nusc_maps = {}
+ self.map_explorer = {}
+ for loc in self.MAPS:
+ self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc)
+ self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc])
+
+ self.patch_size = patch_size
+ self.canvas_size = canvas_size
+ self.sample_dist = sample_dist
+ self.num_samples = num_samples
+ self.padding = padding
+ self.normalize = normalize
+ self.fixed_num = fixed_num
+
+ def gen_vectorized_samples(self, location, ego2global_translation, ego2global_rotation):
+ map_pose = ego2global_translation[:2]
+ rotation = Quaternion(ego2global_rotation)
+
+ patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1])
+ patch_angle = quaternion_yaw(rotation) / np.pi * 180
+
+ line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location)
+ line_vector_dict = self.line_geoms_to_vectors(line_geom)
+
+ ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location)
+ # ped_vector_list = self.ped_geoms_to_vectors(ped_geom)
+ ped_vector_list = self.line_geoms_to_vectors(ped_geom)['ped_crossing']
+
+ polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location)
+ poly_bound_list = self.poly_geoms_to_vectors(polygon_geom)
+
+ vectors = []
+ for line_type, vects in line_vector_dict.items():
+ for line, length in vects:
+ vectors.append((line.astype(float), length, CLASS2LABEL.get(line_type, -1)))
+
+ for ped_line, length in ped_vector_list:
+ vectors.append((ped_line.astype(float), length, CLASS2LABEL.get('ped_crossing', -1)))
+
+ for contour, length in poly_bound_list:
+ vectors.append((contour.astype(float), length, CLASS2LABEL.get('contours', -1)))
+
+ # filter out -1
+ filtered_vectors = []
+ for pts, pts_num, type in vectors:
+ if type != -1:
+ filtered_vectors.append({
+ 'pts': pts,
+ 'pts_num': pts_num,
+ 'type': type
+ })
+
+ return filtered_vectors
+
+ def get_map_geom(self, patch_box, patch_angle, layer_names, location):
+ map_geom = []
+ for layer_name in layer_names:
+ if layer_name in self.line_classes:
+ geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name)
+ map_geom.append((layer_name, geoms))
+ elif layer_name in self.polygon_classes:
+ geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name)
+ map_geom.append((layer_name, geoms))
+ elif layer_name in self.ped_crossing_classes:
+ geoms = self.get_ped_crossing_line(patch_box, patch_angle, location)
+ # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name)
+ map_geom.append((layer_name, geoms))
+ return map_geom
+
+ def _one_type_line_geom_to_vectors(self, line_geom):
+ line_vectors = []
+ for line in line_geom:
+ if not line.is_empty:
+ if line.geom_type == 'MultiLineString':
+ for single_line in line.geoms:
+ line_vectors.append(self.sample_pts_from_line(single_line))
+ elif line.geom_type == 'LineString':
+ line_vectors.append(self.sample_pts_from_line(line))
+ else:
+ raise NotImplementedError
+ return line_vectors
+
+ def poly_geoms_to_vectors(self, polygon_geom):
+ roads = polygon_geom[0][1]
+ lanes = polygon_geom[1][1]
+ union_roads = ops.unary_union(roads)
+ union_lanes = ops.unary_union(lanes)
+ union_segments = ops.unary_union([union_roads, union_lanes])
+ max_x = self.patch_size[1] / 2
+ max_y = self.patch_size[0] / 2
+ local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+ exteriors = []
+ interiors = []
+ if union_segments.geom_type != 'MultiPolygon':
+ union_segments = MultiPolygon([union_segments])
+ for poly in union_segments.geoms:
+ exteriors.append(poly.exterior)
+ for inter in poly.interiors:
+ interiors.append(inter)
+
+ results = []
+ for ext in exteriors:
+ if ext.is_ccw:
+ ext.coords = list(ext.coords)[::-1]
+ lines = ext.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ for inter in interiors:
+ if not inter.is_ccw:
+ inter.coords = list(inter.coords)[::-1]
+ lines = inter.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ return self._one_type_line_geom_to_vectors(results)
+
+ def line_geoms_to_vectors(self, line_geom):
+ line_vectors_dict = dict()
+ for line_type, a_type_of_lines in line_geom:
+ one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines)
+ line_vectors_dict[line_type] = one_type_vectors
+
+ return line_vectors_dict
+
+ def ped_geoms_to_vectors(self, ped_geom):
+ ped_geom = ped_geom[0][1]
+ union_ped = ops.unary_union(ped_geom)
+ if union_ped.geom_type != 'MultiPolygon':
+ union_ped = MultiPolygon([union_ped])
+
+ max_x = self.patch_size[1] / 2
+ max_y = self.patch_size[0] / 2
+ local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+ results = []
+        for ped_poly in union_ped.geoms:
+ # rect = ped_poly.minimum_rotated_rectangle
+ ext = ped_poly.exterior
+ if not ext.is_ccw:
+ ext.coords = list(ext.coords)[::-1]
+ lines = ext.intersection(local_patch)
+ results.append(lines)
+
+ return self._one_type_line_geom_to_vectors(results)
+
+ def get_ped_crossing_line(self, patch_box, patch_angle, location):
+ def add_line(poly_xy, idx, patch, patch_angle, patch_x, patch_y, line_list):
+ points = [(p0, p1) for p0, p1 in zip(poly_xy[0, idx:idx + 2], poly_xy[1, idx:idx + 2])]
+ line = LineString(points)
+ line = line.intersection(patch)
+ if not line.is_empty:
+ line = affinity.rotate(line, -patch_angle, origin=(patch_x, patch_y), use_radians=False)
+ line = affinity.affine_transform(line, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ line_list.append(line)
+
+ patch_x = patch_box[0]
+ patch_y = patch_box[1]
+
+ patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle)
+ line_list = []
+ records = getattr(self.nusc_maps[location], 'ped_crossing')
+ for record in records:
+ polygon = self.map_explorer[location].extract_polygon(record['polygon_token'])
+ poly_xy = np.array(polygon.exterior.xy)
+ dist = np.square(poly_xy[:, 1:] - poly_xy[:, :-1]).sum(0)
+ x1, x2 = np.argsort(dist)[-2:]
+
+ add_line(poly_xy, x1, patch, patch_angle, patch_x, patch_y, line_list)
+ add_line(poly_xy, x2, patch, patch_angle, patch_x, patch_y, line_list)
+
+ return line_list
+
+ def sample_pts_from_line(self, line):
+ if self.fixed_num < 0:
+ distances = np.arange(0, line.length, self.sample_dist)
+ sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ else:
+ # fixed number of points, so distance is line.length / self.fixed_num
+ distances = np.linspace(0, line.length, self.fixed_num)
+ sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+
+ if self.normalize:
+ sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]])
+
+ num_valid = len(sampled_points)
+
+ if not self.padding or self.fixed_num > 0:
+ # fixed num sample can return now!
+ return sampled_points, num_valid
+
+ # fixed distance sampling need padding!
+ num_valid = len(sampled_points)
+
+ if self.fixed_num < 0:
+ if num_valid < self.num_samples:
+ padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ else:
+ sampled_points = sampled_points[:self.num_samples, :]
+ num_valid = self.num_samples
+
+ if self.normalize:
+ sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]])
+ num_valid = len(sampled_points)
+
+ return sampled_points, num_valid
diff --git a/mmcv/datasets/dataset_wrappers.py b/mmcv/datasets/dataset_wrappers.py
new file mode 100644
index 0000000..65921b7
--- /dev/null
+++ b/mmcv/datasets/dataset_wrappers.py
@@ -0,0 +1,353 @@
+import bisect
+import math
+from collections import defaultdict
+
+import numpy as np
+from mmcv.utils import print_log
+from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
+
+from .builder import DATASETS
+from .coco import CocoDataset
+
+
+@DATASETS.register_module()
+class ConcatDataset(_ConcatDataset):
+ """A wrapper of concatenated dataset.
+
+ Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but
+ concat the group flag for image aspect ratio.
+
+ Args:
+ datasets (list[:obj:`Dataset`]): A list of datasets.
+ separate_eval (bool): Whether to evaluate the results
+ separately if it is used as validation dataset.
+ Defaults to True.
+ """
+
+ def __init__(self, datasets, separate_eval=True):
+ super(ConcatDataset, self).__init__(datasets)
+ self.CLASSES = datasets[0].CLASSES
+ self.separate_eval = separate_eval
+ if not separate_eval:
+ if any([isinstance(ds, CocoDataset) for ds in datasets]):
+ raise NotImplementedError(
+ 'Evaluating concatenated CocoDataset as a whole is not'
+ ' supported! Please set "separate_eval=True"')
+ elif len(set([type(ds) for ds in datasets])) != 1:
+ raise NotImplementedError(
+                    'All the datasets should have the same type')
+
+ if hasattr(datasets[0], 'flag'):
+ flags = []
+ for i in range(0, len(datasets)):
+ flags.append(datasets[i].flag)
+ self.flag = np.concatenate(flags)
+
+ def get_cat_ids(self, idx):
+ """Get category ids of concatenated dataset by index.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ list[int]: All categories in the image of specified index.
+ """
+
+ if idx < 0:
+ if -idx > len(self):
+ raise ValueError(
+ 'absolute value of index should not exceed dataset length')
+ idx = len(self) + idx
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+ if dataset_idx == 0:
+ sample_idx = idx
+ else:
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+ return self.datasets[dataset_idx].get_cat_ids(sample_idx)
+
+ def evaluate(self, results, logger=None, **kwargs):
+ """Evaluate the results.
+
+ Args:
+ results (list[list | tuple]): Testing results of the dataset.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+
+ Returns:
+ dict[str: float]: AP results of the total dataset or each separate
+ dataset if `self.separate_eval=True`.
+ """
+ assert len(results) == self.cumulative_sizes[-1], \
+ ('Dataset and results have different sizes: '
+ f'{self.cumulative_sizes[-1]} v.s. {len(results)}')
+
+ # Check whether all the datasets support evaluation
+ for dataset in self.datasets:
+ assert hasattr(dataset, 'evaluate'), \
+ f'{type(dataset)} does not implement evaluate function'
+
+ if self.separate_eval:
+ dataset_idx = -1
+ total_eval_results = dict()
+ for size, dataset in zip(self.cumulative_sizes, self.datasets):
+ start_idx = 0 if dataset_idx == -1 else \
+ self.cumulative_sizes[dataset_idx]
+ end_idx = self.cumulative_sizes[dataset_idx + 1]
+
+ results_per_dataset = results[start_idx:end_idx]
+ print_log(
+                f'\nEvaluating {dataset.ann_file} with '
+ f'{len(results_per_dataset)} images now',
+ logger=logger)
+
+ eval_results_per_dataset = dataset.evaluate(
+ results_per_dataset, logger=logger, **kwargs)
+ dataset_idx += 1
+ for k, v in eval_results_per_dataset.items():
+ total_eval_results.update({f'{dataset_idx}_{k}': v})
+
+ return total_eval_results
+ elif any([isinstance(ds, CocoDataset) for ds in self.datasets]):
+ raise NotImplementedError(
+ 'Evaluating concatenated CocoDataset as a whole is not'
+ ' supported! Please set "separate_eval=True"')
+ elif len(set([type(ds) for ds in self.datasets])) != 1:
+ raise NotImplementedError(
+                'All the datasets should have the same type')
+ else:
+ original_data_infos = self.datasets[0].data_infos
+ self.datasets[0].data_infos = sum(
+ [dataset.data_infos for dataset in self.datasets], [])
+ eval_results = self.datasets[0].evaluate(
+ results, logger=logger, **kwargs)
+ self.datasets[0].data_infos = original_data_infos
+ return eval_results
+
+
+@DATASETS.register_module()
+class RepeatDataset:
+ """A wrapper of repeated dataset.
+
+ The length of repeated dataset will be `times` larger than the original
+ dataset. This is useful when the data loading time is long but the dataset
+ is small. Using RepeatDataset can reduce the data loading time between
+ epochs.
+
+ Args:
+ dataset (:obj:`Dataset`): The dataset to be repeated.
+ times (int): Repeat times.
+ """
+
+ def __init__(self, dataset, times):
+ self.dataset = dataset
+ self.times = times
+ self.CLASSES = dataset.CLASSES
+ if hasattr(self.dataset, 'flag'):
+ self.flag = np.tile(self.dataset.flag, times)
+
+ self._ori_len = len(self.dataset)
+
+ def __getitem__(self, idx):
+ return self.dataset[idx % self._ori_len]
+
+ def get_cat_ids(self, idx):
+ """Get category ids of repeat dataset by index.
+
+ Args:
+ idx (int): Index of data.
+
+ Returns:
+ list[int]: All categories in the image of specified index.
+ """
+
+ return self.dataset.get_cat_ids(idx % self._ori_len)
+
+ def __len__(self):
+ """Length after repetition."""
+ return self.times * self._ori_len
+
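+# e.g. a RepeatDataset wrapping a base dataset of length 100 with times=3 has
+# len() == 300, and index 250 is served from original sample 250 % 100 == 50.
+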
+
+# Modified from https://github.com/facebookresearch/detectron2/blob/41d475b75a230221e21d9cac5d69655e3415e3a4/detectron2/data/samplers/distributed_sampler.py#L57 # noqa
+@DATASETS.register_module()
+class ClassBalancedDataset:
+ """A wrapper of repeated dataset with repeat factor.
+
+    Suitable for training on class imbalanced datasets like LVIS. Following
+    the sampling strategy in `LVIS: A Dataset for Large Vocabulary Instance
+    Segmentation <https://arxiv.org/abs/1908.03195>`_, in each epoch an image
+    may appear multiple times based on its
+ "repeat factor".
+ The repeat factor for an image is a function of the frequency the rarest
+ category labeled in that image. The "frequency of category c" in [0, 1]
+ is defined by the fraction of images in the training set (without repeats)
+ in which category c appears.
+ The dataset needs to instantiate :func:`self.get_cat_ids` to support
+ ClassBalancedDataset.
+
+ The repeat factor is computed as followed.
+
+ 1. For each category c, compute the fraction # of images
+ that contain it: :math:`f(c)`
+ 2. For each category c, compute the category-level repeat factor:
+ :math:`r(c) = max(1, sqrt(t/f(c)))`
+ 3. For each image I, compute the image-level repeat factor:
+ :math:`r(I) = max_{c in I} r(c)`
+
+ Args:
+ dataset (:obj:`CustomDataset`): The dataset to be repeated.
+ oversample_thr (float): frequency threshold below which data is
+ repeated. For categories with ``f_c >= oversample_thr``, there is
+            no oversampling. For categories with ``f_c < oversample_thr``, the
+            degree of oversampling follows the square-root inverse frequency
+            heuristic above.
+ filter_empty_gt (bool, optional): If set true, images without bounding
+ boxes will not be oversampled. Otherwise, they will be categorized
+ as the pure background class and involved into the oversampling.
+ Default: True.
+ """
+
+ def __init__(self, dataset, oversample_thr, filter_empty_gt=True):
+ self.dataset = dataset
+ self.oversample_thr = oversample_thr
+ self.filter_empty_gt = filter_empty_gt
+ self.CLASSES = dataset.CLASSES
+
+ repeat_factors = self._get_repeat_factors(dataset, oversample_thr)
+ repeat_indices = []
+ for dataset_idx, repeat_factor in enumerate(repeat_factors):
+ repeat_indices.extend([dataset_idx] * math.ceil(repeat_factor))
+ self.repeat_indices = repeat_indices
+
+ flags = []
+ if hasattr(self.dataset, 'flag'):
+ for flag, repeat_factor in zip(self.dataset.flag, repeat_factors):
+ flags.extend([flag] * int(math.ceil(repeat_factor)))
+ assert len(flags) == len(repeat_indices)
+ self.flag = np.asarray(flags, dtype=np.uint8)
+
+ def _get_repeat_factors(self, dataset, repeat_thr):
+        """Get the repeat factor for each image in the dataset.
+
+ Args:
+ dataset (:obj:`CustomDataset`): The dataset
+ repeat_thr (float): The threshold of frequency. If an image
+ contains the categories whose frequency below the threshold,
+ it would be repeated.
+
+ Returns:
+            list[float]: The repeat factor for each image in the dataset.
+ """
+
+ # 1. For each category c, compute the fraction # of images
+ # that contain it: f(c)
+ category_freq = defaultdict(int)
+ num_images = len(dataset)
+ for idx in range(num_images):
+ cat_ids = set(self.dataset.get_cat_ids(idx))
+ if len(cat_ids) == 0 and not self.filter_empty_gt:
+ cat_ids = set([len(self.CLASSES)])
+ for cat_id in cat_ids:
+ category_freq[cat_id] += 1
+ for k, v in category_freq.items():
+ category_freq[k] = v / num_images
+
+ # 2. For each category c, compute the category-level repeat factor:
+ # r(c) = max(1, sqrt(t/f(c)))
+ category_repeat = {
+ cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq))
+ for cat_id, cat_freq in category_freq.items()
+ }
+
+ # 3. For each image I, compute the image-level repeat factor:
+ # r(I) = max_{c in I} r(c)
+ repeat_factors = []
+ for idx in range(num_images):
+ cat_ids = set(self.dataset.get_cat_ids(idx))
+ if len(cat_ids) == 0 and not self.filter_empty_gt:
+ cat_ids = set([len(self.CLASSES)])
+ repeat_factor = 1
+ if len(cat_ids) > 0:
+ repeat_factor = max(
+ {category_repeat[cat_id]
+ for cat_id in cat_ids})
+ repeat_factors.append(repeat_factor)
+
+ return repeat_factors
+
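+    # Worked example for _get_repeat_factors (illustrative numbers): with
+    # oversample_thr = 0.5 and a category appearing in 1 of 8 images
+    # (f(c) = 0.125), r(c) = max(1, sqrt(0.5 / 0.125)) = 2.0, so any image for
+    # which this is the rarest category is repeated ceil(2.0) = 2 times per epoch.
+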
+ def __getitem__(self, idx):
+ ori_index = self.repeat_indices[idx]
+ return self.dataset[ori_index]
+
+ def __len__(self):
+ """Length after repetition."""
+ return len(self.repeat_indices)
+
+@DATASETS.register_module()
+class CBGSDataset(object):
+ """A wrapper of class sampled dataset with ann_file path. Implementation of
+    """A wrapper that class-balances a dataset by resampling its sample
+    indices. Implementation of the paper `Class-balanced Grouping and Sampling
+    for Point Cloud 3D Object Detection <https://arxiv.org/abs/1908.09492>`_.
+ Balance the number of scenes under different classes.
+
+ Args:
+ dataset (:obj:`CustomDataset`): The dataset to be class sampled.
+ """
+
+ def __init__(self, dataset):
+ self.dataset = dataset
+ self.CLASSES = dataset.CLASSES
+ self.cat2id = {name: i for i, name in enumerate(self.CLASSES)}
+ self.sample_indices = self._get_sample_indices()
+ # self.dataset.data_infos = self.data_infos
+ if hasattr(self.dataset, 'flag'):
+ self.flag = np.array(
+ [self.dataset.flag[ind] for ind in self.sample_indices],
+ dtype=np.uint8)
+
+ def _get_sample_indices(self):
+        """Compute class-balanced sample indices.
+
+        Samples are grouped by category and each group is resampled (with
+        replacement) so that every class contributes roughly the same fraction
+        of the final index list.
+
+        Returns:
+            list[int]: Sample indices after class-balanced resampling.
+        """
+ class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()}
+ for idx in range(len(self.dataset)):
+ sample_cat_ids = self.dataset.get_cat_ids(idx)
+ for cat_id in sample_cat_ids:
+ class_sample_idxs[cat_id].append(idx)
+ duplicated_samples = sum(
+ [len(v) for _, v in class_sample_idxs.items()])
+ class_distribution = {
+ k: len(v) / duplicated_samples
+ for k, v in class_sample_idxs.items()
+ }
+
+ sample_indices = []
+
+ frac = 1.0 / len(self.CLASSES)
+ ratios = [frac / v for v in class_distribution.values()]
+ for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios):
+ sample_indices += np.random.choice(cls_inds,
+ int(len(cls_inds) *
+ ratio)).tolist()
+ return sample_indices
+
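+    # Illustrative numbers for _get_sample_indices: with 10 classes the target
+    # fraction is frac = 0.1; a class holding only 2% of the duplicated samples
+    # gets ratio = 0.1 / 0.02 = 5, so its indices are re-drawn (with replacement)
+    # at 5x their original count, balancing classes at the sample level.
+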
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ ori_idx = self.sample_indices[idx]
+ return self.dataset[ori_idx]
+
+ def __len__(self):
+ """Return the length of data infos.
+
+ Returns:
+ int: Length of data infos.
+ """
+ return len(self.sample_indices)
diff --git a/mmcv/datasets/dd3d_nuscenes_dataset.py b/mmcv/datasets/dd3d_nuscenes_dataset.py
new file mode 100644
index 0000000..6c77617
--- /dev/null
+++ b/mmcv/datasets/dd3d_nuscenes_dataset.py
@@ -0,0 +1,359 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+#import functools
+from collections import OrderedDict
+
+import numpy as np
+import seaborn as sns
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+from mmcv.structures import BoxMode
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.splits import create_splits_scenes
+
+#from tridet.data import collect_dataset_dicts
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
+from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.pose import Pose
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
+from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color
+
+# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
+# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
+# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
+# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
+# tracker if required
+DATASET_NAME_TO_VERSION = {
+ "nusc_train": "v1.0-trainval",
+ "nusc_val": "v1.0-trainval",
+ "nusc_val-subsample-8": "v1.0-trainval",
+ "nusc_trainval": "v1.0-trainval",
+ "nusc_test": "v1.0-test",
+ "nusc_mini_train": "v1.0-mini",
+ "nusc_mini_val": "v1.0-mini",
+}
+
+CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')
+
+ATTRIBUTE_IDS = {
+ 'vehicle.moving': 0,
+ 'vehicle.parked': 1,
+ 'vehicle.stopped': 2,
+ 'pedestrian.moving': 0,
+ 'pedestrian.standing': 1,
+ 'pedestrian.sitting_lying_down': 2,
+ 'cycle.with_rider': 0,
+ 'cycle.without_rider': 1,
+}
+
+CATEGORY_IDS = OrderedDict({
+ 'barrier': 0,
+ 'bicycle': 1,
+ 'bus': 2,
+ 'car': 3,
+ 'construction_vehicle': 4,
+ 'motorcycle': 5,
+ 'pedestrian': 6,
+ 'traffic_cone': 7,
+ 'trailer': 8,
+ 'truck': 9,
+})
+
+COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
+COLORMAP = OrderedDict({
+ 'barrier': COLORS[8], # yellow
+ 'bicycle': COLORS[0], # blue
+ 'bus': COLORS[6], # pink
+ 'car': COLORS[2], # green
+ 'construction_vehicle': COLORS[7], # gray
+ 'motorcycle': COLORS[4], # purple
+ 'pedestrian': COLORS[1], # orange
+ 'traffic_cone': COLORS[3], # red
+ 'trailer': COLORS[9], # skyblue
+ 'truck': COLORS[5], # brown
+})
+
+MAX_NUM_ATTRIBUTES = 3
+
+
+def _compute_iou(box1, box2):
+ """
+ Parameters
+ ----------
+ box1, box2:
+ (x1, y1, x2, y2)
+ """
+ xx1 = max(box1[0], box2[0])
+ yy1 = max(box1[1], box2[1])
+ xx2 = min(box1[2], box2[2])
+ yy2 = min(box1[3], box2[3])
+ if xx1 >= xx2 or yy1 >= yy2:
+ return 0.
+ inter = (xx2 - xx1) * (yy2 - yy1)
+ a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+ a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
+ return inter / (a1 + a2 - inter)
+
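+# Quick numeric check for _compute_iou: box1 = (0, 0, 2, 2), box2 = (1, 1, 3, 3)
+# -> intersection = 1, union = 4 + 4 - 1 = 7, IoU = 1 / 7 ≈ 0.143.
+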
+
+class DD3DNuscenesDataset(Dataset):
+ def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused):
+ self.data_root = data_root
+ assert name in DATASET_NAME_TO_VERSION
+ version = DATASET_NAME_TO_VERSION[name]
+ self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)
+
+ self.datum_names = datum_names
+ self.min_num_lidar_points = min_num_lidar_points
+ self.min_box_visibility = min_box_visibility
+
+ self.dataset_item_info = self._build_dataset_item_info(name)
+
+ # Index instance tokens to their IDs
+ self._instance_token_to_id = self._index_instance_tokens()
+
+ # Construct the mapping from datum_token (image id) to index
+ print("Generating the mapping from image id to idx...")
+ self.datumtoken2idx = {}
+ for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info):
+ self.datumtoken2idx[datum_token] = idx
+ print("Done.")
+
+ def _build_dataset_item_info(self, name):
+ scenes_in_split = self._get_split_scenes(name)
+
+ dataset_items = []
+ for _, scene_token in tqdm(scenes_in_split):
+ scene = self.nusc.get('scene', scene_token)
+ sample_token = scene['first_sample_token']
+ for sample_idx in range(scene['nbr_samples']):
+ if name.endswith('subsample-8') and sample_idx % 8 > 0:
+ # Sample-level subsampling.
+ continue
+
+ sample = self.nusc.get('sample', sample_token)
+ for datum_name, datum_token in sample['data'].items():
+ if datum_name not in self.datum_names:
+ continue
+ dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
+ sample_token = sample['next']
+ return dataset_items
+
+ def _get_split_scenes(self, name):
+ scenes_in_splits = create_splits_scenes()
+ if name == "nusc_trainval":
+ scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
+ elif name == "nusc_val-subsample-8":
+ scenes = scenes_in_splits["val"]
+ else:
+ assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
+ split = name[5:]
+ assert split in scenes_in_splits, f"Invalid dataset: {split}"
+ scenes = scenes_in_splits[split]
+
+ # Mapping from scene name to token.
+ name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
+ return [(name, name_to_token[name]) for name in scenes]
+
+ def __len__(self):
+ return len(self.dataset_item_info)
+
+ def _build_id(self, scene_name, sample_idx, datum_name):
+ sample_id = f"{scene_name}_{sample_idx:03d}"
+ image_id = f"{sample_id}_{datum_name}"
+ return image_id, sample_id
+
+ def _index_instance_tokens(self):
+ """Index instance tokens for uniquely identifying instances across samples"""
+ instance_token_to_id = {}
+ for record in self.nusc.sample_annotation:
+ instance_token = record['instance_token']
+ if instance_token not in instance_token_to_id:
+ next_instance_id = len(instance_token_to_id)
+ instance_token_to_id[instance_token] = next_instance_id
+ return instance_token_to_id
+
+ def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
+ annotations = []
+ for _ann in annotation_list:
+ ann = self.nusc.get('sample_annotation', _ann.token)
+ if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
+ continue
+ annotation = OrderedDict()
+
+ # --------
+ # Category
+ # --------
+ category = category_to_detection_name(ann['category_name'])
+ if category is None:
+ continue
+ annotation['category_id'] = CATEGORY_IDS[category]
+
+ # ------
+ # 3D box
+ # ------
+ # NOTE: ann['rotation'], ann['translation'] is in global frame.
+ pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center) # pose in sensor frame
+ # DEBUG:
+ # pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation']))
+ # pose_WO_2 = pose_WS * pose_SO
+ # assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix)
+ bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
+ annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]
+
+ # --------------------------------------
+ # 2D box -- project 8 corners of 3D bbox
+ # --------------------------------------
+ corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
+ l, t = corners[:, 0].min(), corners[:, 1].min()
+ r, b = corners[:, 0].max(), corners[:, 1].max()
+
+ x1 = max(0, l)
+ y1 = max(0, t)
+ x2 = min(image_shape[1], r)
+ y2 = min(image_shape[0], b)
+
+ iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2])
+ if iou < self.min_box_visibility:
+ continue
+
+ annotation['bbox'] = [x1, y1, x2, y2]
+ annotation['bbox_mode'] = BoxMode.XYXY_ABS
+
+ # --------
+ # Track ID
+ # --------
+ annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]
+
+ # ---------
+ # Attribute
+ # ---------
+ attr_tokens = ann['attribute_tokens']
+            assert len(attr_tokens) < 2  # NOTE: allow at most one attribute.
+ attribute_id = MAX_NUM_ATTRIBUTES # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
+ if attr_tokens:
+ attribute = self.nusc.get('attribute', attr_tokens[0])['name']
+ attribute_id = ATTRIBUTE_IDS[attribute]
+ annotation['attribute_id'] = attribute_id
+
+ # -----
+ # Speed
+ # -----
+ vel_global = self.nusc.box_velocity(ann['token'])
+ speed = np.linalg.norm(vel_global) # NOTE: This can be NaN.
+ # DEBUG:
+ # speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global
+ annotation['speed'] = speed
+
+ annotations.append(annotation)
+
+ return annotations
+
+ def _get_ego_velocity(self, current, max_time_diff=1.5):
+ """Velocity of ego-vehicle in m/s.
+ """
+ has_prev = current['prev'] != ''
+ has_next = current['next'] != ''
+
+ # Cannot estimate velocity for a single annotation.
+ if not has_prev and not has_next:
+ return np.array([np.nan, np.nan, np.nan])
+
+ if has_prev:
+ first = self.nusc.get('sample_data', current['prev'])
+ else:
+ first = current
+
+ if has_next:
+ last = self.nusc.get('sample_data', current['next'])
+ else:
+ last = current
+
+ pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
+ pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
+ pos_diff = np.float32(pos_last) - np.float32(pos_first)
+
+ time_last = 1e-6 * last['timestamp']
+ time_first = 1e-6 * first['timestamp']
+ time_diff = time_last - time_first
+
+ if has_next and has_prev:
+ # If doing centered difference, allow for up to double the max_time_diff.
+ max_time_diff *= 2
+
+ if time_diff > max_time_diff:
+ # If time_diff is too big, don't return an estimate.
+ return np.array([np.nan, np.nan, np.nan])
+ else:
+ return pos_diff / time_diff
+
+ def __getitem__(self, idx):
+ datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
+ datum = self.nusc.get('sample_data', datum_token)
+ assert datum['is_key_frame']
+
+ filename, _annotations, K = self.nusc.get_sample_data(datum_token)
+ image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
+ height, width = datum['height'], datum['width']
+ d2_dict = OrderedDict(
+ file_name=filename,
+ height=height,
+ width=width,
+ image_id=image_id,
+ sample_id=sample_id,
+ sample_token=sample_token
+ )
+
+ # Intrinsics
+ d2_dict['intrinsics'] = list(K.flatten())
+
+ # Get pose of the sensor (S) from vehicle (V) frame
+ _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
+ pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
+
+ # Get ego-pose of the vehicle (V) from global/world (W) frame
+ _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
+ pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
+ pose_WS = pose_WV * pose_VS
+
+ d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
+ d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
+
+ d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
+
+ d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
+
+ return d2_dict
+
+ def getitem_by_datumtoken(self, datum_token):
+ # idx = self.datumtoken2idx[datum_token]
+ # ret = self.__getitem__(idx)
+
+ datum = self.nusc.get('sample_data', datum_token)
+ sample_token = datum['sample_token']
+ filename, _annotations, K = self.nusc.get_sample_data(datum_token)
+ height, width = datum['height'], datum['width']
+ d2_dict = OrderedDict(
+ file_name=filename,
+ height=height,
+ width=width,
+ image_id=0,
+ sample_id=0,
+ sample_token=sample_token
+ )
+ # Intrinsics
+ d2_dict['intrinsics'] = list(K.flatten())
+ # Get pose of the sensor (S) from vehicle (V) frame
+ _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
+ pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
+ # Get ego-pose of the vehicle (V) from global/world (W) frame
+ _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
+ pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
+ pose_WS = pose_WV * pose_VS
+
+ d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
+ d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
+
+ d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
+
+ d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
+ return d2_dict
\ No newline at end of file
diff --git a/mmcv/datasets/eval_utils/eval_utils.py b/mmcv/datasets/eval_utils/eval_utils.py
new file mode 100644
index 0000000..9d56923
--- /dev/null
+++ b/mmcv/datasets/eval_utils/eval_utils.py
@@ -0,0 +1,911 @@
+import json
+import torch
+import tqdm
+from typing import List, Dict, Tuple, Callable, Union
+from nuscenes import NuScenes
+from pyquaternion import Quaternion
+import numpy as np
+from .metric_utils import min_ade, min_fde, miss_rate
+
+from nuscenes.utils.splits import create_splits_scenes
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.prediction import PredictHelper, convert_local_coords_to_global
+from nuscenes.eval.common.data_classes import EvalBox, EvalBoxes
+from nuscenes.eval.detection.data_classes import DetectionBox, DetectionConfig
+from nuscenes.eval.detection.data_classes import DetectionMetricData, DetectionMetricDataList, DetectionMetrics
+from nuscenes.eval.tracking.data_classes import TrackingBox  # referenced in load_gt below
+from nuscenes.eval.common.utils import center_distance, scale_iou, yaw_diff, velocity_l2, attr_acc, cummean
+
+def category_to_motion_name(category_name: str):
+ """
+ Default label mapping from nuScenes to nuScenes detection classes.
+ Note that pedestrian does not include personal_mobility, stroller and wheelchair.
+ :param category_name: Generic nuScenes class.
+ :return: nuScenes detection class.
+ """
+ detection_mapping = {
+ 'movable_object.barrier': 'barrier',
+ 'vehicle.bicycle': 'car',
+ 'vehicle.bus.bendy': 'car',
+ 'vehicle.bus.rigid': 'car',
+ 'vehicle.car': 'car',
+ 'vehicle.construction': 'car',
+ 'vehicle.motorcycle': 'car',
+ 'human.pedestrian.adult': 'pedestrian',
+ 'human.pedestrian.child': 'pedestrian',
+ 'human.pedestrian.construction_worker': 'pedestrian',
+ 'human.pedestrian.police_officer': 'pedestrian',
+ 'movable_object.trafficcone': 'barrier',
+ 'vehicle.trailer': 'car',
+ 'vehicle.truck': 'car'
+ }
+
+ if category_name in detection_mapping:
+ return detection_mapping[category_name]
+ else:
+ return None
+
+def detection_prediction_category_to_motion_name(category_name: str):
+ """
+ Default label mapping from nuScenes to nuScenes detection classes.
+ Note that pedestrian does not include personal_mobility, stroller and wheelchair.
+ :param category_name: Generic nuScenes class.
+ :return: nuScenes detection class.
+ """
+ detection_mapping = {
+ 'car': 'car',
+ 'truck': 'car',
+ 'construction_vehicle': 'car',
+ 'bus': 'car',
+ 'trailer': 'car',
+ 'motorcycle': 'car',
+ 'bicycle': 'car',
+ 'pedestrian': 'pedestrian',
+ 'traffic_cone': 'barrier',
+ 'barrier': 'barrier',
+ }
+
+ if category_name in detection_mapping:
+ return detection_mapping[category_name]
+ else:
+ return None
+
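+# Example: category_to_motion_name('vehicle.truck') -> 'car';
+#          detection_prediction_category_to_motion_name('traffic_cone') -> 'barrier'.
+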
+class DetectionMotionMetrics(DetectionMetrics):
+ """ Stores average precision and true positive metric results. Provides properties to summarize. """
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized dictionary. """
+
+ cfg = DetectionConfig.deserialize(content['cfg'])
+ metrics = cls(cfg=cfg)
+ metrics.add_runtime(content['eval_time'])
+
+ for detection_name, label_aps in content['label_aps'].items():
+ for dist_th, ap in label_aps.items():
+ metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap))
+
+ for detection_name, label_tps in content['label_tp_errors'].items():
+ for metric_name, tp in label_tps.items():
+ metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp))
+
+ return metrics
+
+class DetectionMotionMetricDataList(DetectionMetricDataList):
+ """ This stores a set of MetricData in a dict indexed by (name, match-distance). """
+ @classmethod
+ def deserialize(cls, content: dict):
+ mdl = cls()
+ for key, md in content.items():
+ name, distance = key.split(':')
+ mdl.set(name, float(distance), DetectionMotionMetricData.deserialize(md))
+ return mdl
+
+class DetectionMotionMetricData(DetectionMetricData):
+ """ This class holds accumulated and interpolated data required to calculate the detection metrics. """
+
+ nelem = 101
+
+ def __init__(self,
+ recall: np.array,
+ precision: np.array,
+ confidence: np.array,
+ trans_err: np.array,
+ vel_err: np.array,
+ scale_err: np.array,
+ orient_err: np.array,
+ attr_err: np.array,
+ min_ade_err: np.array,
+ min_fde_err: np.array,
+ miss_rate_err: np.array):
+
+ # Assert lengths.
+ assert len(recall) == self.nelem
+ assert len(precision) == self.nelem
+ assert len(confidence) == self.nelem
+ assert len(trans_err) == self.nelem
+ assert len(vel_err) == self.nelem
+ assert len(scale_err) == self.nelem
+ assert len(orient_err) == self.nelem
+ assert len(attr_err) == self.nelem
+ assert len(min_ade_err) == self.nelem
+ assert len(min_fde_err) == self.nelem
+ assert len(miss_rate_err) == self.nelem
+
+ # Assert ordering.
+ assert all(confidence == sorted(confidence, reverse=True)) # Confidences should be descending.
+ assert all(recall == sorted(recall)) # Recalls should be ascending.
+
+ # Set attributes explicitly to help IDEs figure out what is going on.
+ self.recall = recall
+ self.precision = precision
+ self.confidence = confidence
+ self.trans_err = trans_err
+ self.vel_err = vel_err
+ self.scale_err = scale_err
+ self.orient_err = orient_err
+ self.attr_err = attr_err
+ self.min_ade_err = min_ade_err
+ self.min_fde_err = min_fde_err
+ self.miss_rate_err = miss_rate_err
+
+ def __eq__(self, other):
+ eq = True
+ for key in self.serialize().keys():
+ eq = eq and np.array_equal(getattr(self, key), getattr(other, key))
+ return eq
+
+ @property
+ def max_recall_ind(self):
+ """ Returns index of max recall achieved. """
+
+ # Last instance of confidence > 0 is index of max achieved recall.
+ non_zero = np.nonzero(self.confidence)[0]
+ if len(non_zero) == 0: # If there are no matches, all the confidence values will be zero.
+ max_recall_ind = 0
+ else:
+ max_recall_ind = non_zero[-1]
+
+ return max_recall_ind
+
+ @property
+ def max_recall(self):
+ """ Returns max recall achieved. """
+
+ return self.recall[self.max_recall_ind]
+
+ def serialize(self):
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'recall': self.recall.tolist(),
+ 'precision': self.precision.tolist(),
+ 'confidence': self.confidence.tolist(),
+ 'trans_err': self.trans_err.tolist(),
+ 'vel_err': self.vel_err.tolist(),
+ 'scale_err': self.scale_err.tolist(),
+ 'orient_err': self.orient_err.tolist(),
+ 'attr_err': self.attr_err.tolist(),
+ 'min_ade_err': self.min_ade_err.tolist(),
+ 'min_fde_err': self.min_fde_err.tolist(),
+ 'miss_rate_err': self.miss_rate_err.tolist(),
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(recall=np.array(content['recall']),
+ precision=np.array(content['precision']),
+ confidence=np.array(content['confidence']),
+ trans_err=np.array(content['trans_err']),
+ vel_err=np.array(content['vel_err']),
+ scale_err=np.array(content['scale_err']),
+ orient_err=np.array(content['orient_err']),
+ attr_err=np.array(content['attr_err']),
+ min_ade_err=np.array(content['min_ade_err']),
+ min_fde_err=np.array(content['min_fde_err']),
+ miss_rate_err=np.array(content['miss_rate_err']))
+
+ @classmethod
+ def no_predictions(cls):
+ """ Returns a md instance corresponding to having no predictions. """
+ return cls(recall=np.linspace(0, 1, cls.nelem),
+ precision=np.zeros(cls.nelem),
+ confidence=np.zeros(cls.nelem),
+ trans_err=np.ones(cls.nelem),
+ vel_err=np.ones(cls.nelem),
+ scale_err=np.ones(cls.nelem),
+ orient_err=np.ones(cls.nelem),
+ attr_err=np.ones(cls.nelem),
+ min_ade_err=np.ones(cls.nelem),
+ min_fde_err=np.ones(cls.nelem),
+ miss_rate_err=np.ones(cls.nelem))
+
+ @classmethod
+ def random_md(cls):
+ """ Returns an md instance corresponding to a random results. """
+ return cls(recall=np.linspace(0, 1, cls.nelem),
+ precision=np.random.random(cls.nelem),
+ confidence=np.linspace(0, 1, cls.nelem)[::-1],
+ trans_err=np.random.random(cls.nelem),
+ vel_err=np.random.random(cls.nelem),
+ scale_err=np.random.random(cls.nelem),
+ orient_err=np.random.random(cls.nelem),
+ attr_err=np.random.random(cls.nelem),
+ min_ade_err=np.random.random(cls.nelem),
+ min_fde_err=np.random.random(cls.nelem),
+ miss_rate_err=np.random.random(cls.nelem))
+
+
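+# DetectionBox extended with a predicted future trajectory (`traj`) and its per-mode scores
+# (`traj_scores`), so that motion forecasting can be evaluated jointly with detection.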
+class DetectionMotionBox(DetectionBox):
+ def __init__(self,
+ sample_token: str = "",
+ translation: Tuple[float, float, float] = (0, 0, 0),
+ size: Tuple[float, float, float] = (0, 0, 0),
+ rotation: Tuple[float, float, float, float] = (0, 0, 0, 0),
+ velocity: Tuple[float, float] = (0, 0),
+                 ego_translation: Tuple[float, float, float] = (0, 0, 0),  # Translation to ego vehicle in meters.
+ num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes.
+ detection_name: str = 'car', # The class name used in the detection challenge.
+ detection_score: float = -1.0, # GT samples do not have a score.
+ attribute_name: str = '',
+ traj=None,
+ traj_scores=None): # Box attribute. Each box can have at most 1 attribute.
+ super(DetectionBox, self).__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts)
+ assert detection_name is not None, 'Error: detection_name cannot be empty!'
+ # assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name
+
+ # assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \
+ # 'Error: Unknown attribute_name %s' % attribute_name
+
+ assert type(detection_score) == float, 'Error: detection_score must be a float!'
+ assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!'
+
+ # Assign.
+ self.detection_name = detection_name
+ self.attribute_name = attribute_name
+ self.detection_score = detection_score
+ self.traj = traj
+ self.traj_scores = traj_scores
+ self.traj_index = None
+
+ def __eq__(self, other):
+ return (self.sample_token == other.sample_token and
+ self.translation == other.translation and
+ self.size == other.size and
+ self.rotation == other.rotation and
+ self.velocity == other.velocity and
+ self.ego_translation == other.ego_translation and
+ self.num_pts == other.num_pts and
+ self.detection_name == other.detection_name and
+ self.detection_score == other.detection_score and
+ self.attribute_name == other.attribute_name and
+ np.all(self.traj == other.traj) and
+ np.all(self.traj_scores == other.traj_scores))
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'traj': self.traj,
+ 'traj_scores': self.traj_scores
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'],
+ traj=content['predict_traj'],
+ traj_scores=content['predict_traj_score'])
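+
+    # Note: deserialize() expects prediction result files to store the forecast under
+    # `predict_traj` / `predict_traj_score`, while serialize() writes them as `traj` / `traj_scores`.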
+
+class DetectionMotionBox_modified(DetectionMotionBox):
+ def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
+        '''
+        DetectionMotionBox extended with the annotation token, visibility level and per-scene frame index.
+        '''
+ super().__init__(*args, **kwargs)
+ self.token = token
+ self.visibility = visibility
+ self.index = index
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'token': self.token,
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'visibility': self.visibility,
+ 'index': self.index,
+ 'traj': self.traj,
+ 'traj_scores': self.traj_scores
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(
+ token=content['token'],
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'],
+ visibility=content['visibility'],
+ index=content['index'],
+ traj=content['traj'],
+ )
+
+
+def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False, category_convert_type='detection_category') \
+ -> Tuple[EvalBoxes, Dict]:
+ """
+ Loads object predictions from file.
+ :param result_path: Path to the .json result file provided by the user.
+    :param max_boxes_per_sample: Maximum number of boxes allowed per sample.
+    :param box_cls: Type of box to load, e.g. DetectionBox, DetectionMotionBox or TrackingBox.
+    :param verbose: Whether to print messages to stdout.
+    :param category_convert_type: If 'motion_category', remap detection class names to the coarse motion classes.
+    :return: The deserialized results and meta data.
+ """
+
+ # Load from file and check that the format is correct.
+ with open(result_path) as f:
+ data = json.load(f)
+    assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed. ' \
+        'See https://www.nuscenes.org/object-detection for more information.'
+
+ if category_convert_type == 'motion_category':
+ for key in data['results'].keys():
+ for i in range(len(data['results'][key])):
+ data['results'][key][i]['detection_name'] = detection_prediction_category_to_motion_name(data['results'][key][i]['detection_name'])
+ # Deserialize results and get meta data.
+ all_results = EvalBoxes.deserialize(data['results'], box_cls)
+ meta = data['meta']
+ if verbose:
+ print("Loaded results from {}. Found detections for {} samples."
+ .format(result_path, len(all_results.sample_tokens)))
+
+ # Check that each sample has no more than x predicted boxes.
+ for sample_token in all_results.sample_tokens:
+ assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \
+ "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample
+
+ return all_results, meta
+
+def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False, category_convert_type='detection_category'):
+ """
+ Loads ground truth boxes from DB.
+ :param nusc: A NuScenes instance.
+ :param eval_split: The evaluation split for which we load GT boxes.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+    :param verbose: Whether to print messages to stdout.
+    :param category_convert_type: Whether to map GT categories to nuScenes detection classes ('detection_category')
+        or to the coarse motion classes ('motion_category').
+    :return: The GT boxes.
+ """
+ predict_helper = PredictHelper(nusc)
+ # Init.
+ if box_cls == DetectionMotionBox_modified:
+ attribute_map = {a['token']: a['name'] for a in nusc.attribute}
+
+ if verbose:
+ print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
+ # Read out all sample_tokens in DB.
+ sample_tokens_all = [s['token'] for s in nusc.sample]
+ assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
+
+ # Only keep samples from this split.
+ splits = create_splits_scenes()
+
+ # Check compatibility of split with nusc_version.
+ version = nusc.version
+ if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
+ assert version.endswith('trainval'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split in {'mini_train', 'mini_val'}:
+ assert version.endswith('mini'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split == 'test':
+ assert version.endswith('test'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ else:
+ raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
+ .format(eval_split))
+
+ if eval_split == 'test':
+ # Check that you aren't trying to cheat :).
+ assert len(nusc.sample_annotation) > 0, \
+ 'Error: You are trying to evaluate on the test set but you do not have the annotations!'
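+
+    # Build a per-scene frame index: the first sample in each scene gets index 1 and subsequent
+    # samples are numbered consecutively by following the `next` pointers.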
+ index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ index_map[sample['token']] = index
+ index += 1
+
+ sample_tokens = []
+ for sample_token in sample_tokens_all:
+ scene_token = nusc.get('sample', sample_token)['scene_token']
+ scene_record = nusc.get('scene', scene_token)
+ if scene_record['name'] in splits[eval_split]:
+ sample_tokens.append(sample_token)
+
+ all_annotations = EvalBoxes()
+
+ # Load annotations and filter predictions and annotations.
+ tracking_id_set = set()
+ for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
+
+ sample = nusc.get('sample', sample_token)
+ sample_annotation_tokens = sample['anns']
+
+ sample_boxes = []
+ for sample_annotation_token in sample_annotation_tokens:
+
+ sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
+ if box_cls == DetectionMotionBox_modified:
+ # Get label name in detection task and filter unused labels.
+ if category_convert_type == 'detection_category':
+ detection_name = category_to_detection_name(sample_annotation['category_name'])
+ elif category_convert_type == 'motion_category':
+ detection_name = category_to_motion_name(sample_annotation['category_name'])
+ else:
+ raise NotImplementedError
+ if detection_name is None:
+ continue
+ # Get attribute_name.
+ attr_tokens = sample_annotation['attribute_tokens']
+ attr_count = len(attr_tokens)
+ if attr_count == 0:
+ attribute_name = ''
+ elif attr_count == 1:
+ attribute_name = attribute_map[attr_tokens[0]]
+ else:
+ raise Exception('Error: GT annotations must not have more than one attribute!')
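+
+                # Attach the GT agent's 6 s future trajectory (converted out of the agent frame via its
+                # box pose) so the motion metrics can be evaluated against it; empty if no future exists.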
+ instance_token = nusc.get('sample_annotation', sample_annotation['token'])['instance_token']
+ fut_traj_local = predict_helper.get_future_for_agent(instance_token, sample_token, seconds=6, in_agent_frame=True)
+ fut_traj_scence_centric = np.zeros((0,))
+ if fut_traj_local.shape[0] > 0:
+ _, boxes, _ = nusc.get_sample_data(sample['data']['LIDAR_TOP'], selected_anntokens=[sample_annotation['token']])
+ box = boxes[0]
+ trans = box.center
+ rot = Quaternion(matrix=box.rotation_matrix)
+ fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot)
+
+ sample_boxes.append(
+ box_cls(
+ token=sample_annotation_token,
+ sample_token=sample_token,
+ translation=sample_annotation['translation'],
+ size=sample_annotation['size'],
+ rotation=sample_annotation['rotation'],
+ velocity=nusc.box_velocity(sample_annotation['token'])[:2],
+ num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
+ detection_name=detection_name,
+ detection_score=-1.0, # GT samples do not have a score.
+ attribute_name=attribute_name,
+ visibility=sample_annotation['visibility_token'],
+ index=index_map[sample_token],
+ traj=fut_traj_scence_centric,
+ )
+ )
+ elif box_cls == TrackingBox:
+ assert False
+ else:
+ raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
+
+ all_annotations.add_boxes(sample_token, sample_boxes)
+
+ if verbose:
+ print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
+
+ return all_annotations
+
+def prediction_metrics(gt_box_match, pred_box):
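+    """
+    Compute minADE / minFDE / miss rate between a matched GT box and a multi-modal predicted trajectory.
+    The GT future is zero-padded to the prediction horizon and an inverted validity mask is passed to
+    the metric helpers so that padded steps can be ignored.
+    """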
+ pred_traj = np.array(pred_box.traj)
+ gt_traj_steps = gt_box_match.traj.reshape((-1, 2))
+ valid_steps = gt_traj_steps.shape[0]
+ if valid_steps <= 0:
+ return np.array([0]), np.array([0]), 0
+ nmodes = pred_traj.shape[0]
+ pred_steps = pred_traj.shape[1]
+ valid_mask = np.zeros((pred_steps, ))
+ gt_traj = np.zeros((pred_steps, 2))
+ gt_traj[:valid_steps, :] = gt_traj_steps
+ valid_mask[: valid_steps] = 1
+ pred_traj = torch.tensor(pred_traj[None])
+ gt_traj = torch.tensor(gt_traj[None])
+ valid_mask = torch.tensor(valid_mask[None])
+ ade_err, inds = min_ade(pred_traj, gt_traj, 1 - valid_mask)
+ fde_err, inds = min_fde(pred_traj, gt_traj, 1 - valid_mask)
+ mr_err = miss_rate(pred_traj, gt_traj, 1 - valid_mask, dist_thresh=2)
+ return ade_err.numpy(), fde_err.numpy(), mr_err.numpy()
+
+
+def accumulate(gt_boxes: EvalBoxes,
+ pred_boxes: EvalBoxes,
+ class_name: str,
+ dist_fcn: Callable,
+ dist_th: float,
+ verbose: bool = False) -> DetectionMotionMetricData:
+ """
+    Average precision over predefined recall thresholds for a single distance threshold.
+ The recall/conf thresholds and other raw metrics will be used in secondary metrics.
+ :param gt_boxes: Maps every sample_token to a list of its sample_annotations.
+ :param pred_boxes: Maps every sample_token to a list of its sample_results.
+ :param class_name: Class to compute AP on.
+ :param dist_fcn: Distance function used to match detections and ground truths.
+ :param dist_th: Distance threshold for a match.
+ :param verbose: If true, print debug messages.
+    :return: (metric_data, N_tp, N_fp, npos). The raw metric data plus true-positive, false-positive and GT counts.
+ """
+ # ---------------------------------------------
+ # Organize input and initialize accumulators.
+ # ---------------------------------------------
+
+ # Count the positives.
+ npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name])
+ if verbose:
+ print("Found {} GT of class {} out of {} total across {} samples.".
+ format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens)))
+
+ # For missing classes in the GT, return a data structure corresponding to no predictions.
+ if npos == 0:
+ return DetectionMotionMetricData.no_predictions(), 0, 0, 0
+
+ # Organize the predictions in a single list.
+ pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name]
+ pred_confs = [box.detection_score for box in pred_boxes_list]
+
+ if verbose:
+ print("Found {} PRED of class {} out of {} total across {} samples.".
+ format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens)))
+
+ # Sort by confidence.
+ sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1]
+
+ # Do the actual matching.
+ tp = [] # Accumulator of true positives.
+ fp = [] # Accumulator of false positives.
+ conf = [] # Accumulator of confidences.
+
+ # match_data holds the extra metrics we calculate for each match.
+ match_data = {'trans_err': [],
+ 'vel_err': [],
+ 'scale_err': [],
+ 'orient_err': [],
+ 'attr_err': [],
+ 'conf': [],
+ 'min_ade_err': [],
+ 'min_fde_err': [],
+ 'miss_rate_err': []}
+
+ # ---------------------------------------------
+ # Match and accumulate match data.
+ # ---------------------------------------------
+
+ taken = set() # Initially no gt bounding box is matched.
+ for ind in sortind:
+ pred_box = pred_boxes_list[ind]
+ min_dist = np.inf
+ match_gt_idx = None
+
+ for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]):
+
+ # Find closest match among ground truth boxes
+ if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken:
+ this_distance = dist_fcn(gt_box, pred_box)
+ if this_distance < min_dist:
+ min_dist = this_distance
+ match_gt_idx = gt_idx
+
+ # If the closest match is close enough according to threshold we have a match!
+ is_match = min_dist < dist_th
+
+ if is_match:
+ taken.add((pred_box.sample_token, match_gt_idx))
+
+ # Update tp, fp and confs.
+ tp.append(1)
+ fp.append(0)
+ conf.append(pred_box.detection_score)
+
+ # Since it is a match, update match data also.
+ gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx]
+
+ match_data['trans_err'].append(center_distance(gt_box_match, pred_box))
+ match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box))
+ match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box))
+
+            # Barrier orientation is only determined up to 180 degrees. (For cones, orientation is discarded later.)
+ period = np.pi if class_name == 'barrier' else 2 * np.pi
+ match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period))
+
+ match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box))
+ minade, minfde, m_r = prediction_metrics(gt_box_match, pred_box)
+
+ match_data['min_ade_err'].append(minade)
+ match_data['min_fde_err'].append(minfde)
+ match_data['miss_rate_err'].append(m_r)
+ match_data['conf'].append(pred_box.detection_score)
+
+ else:
+ # No match. Mark this as a false positive.
+ tp.append(0)
+ fp.append(1)
+ conf.append(pred_box.detection_score)
+
+ # Check if we have any matches. If not, just return a "no predictions" array.
+ if len(match_data['trans_err']) == 0:
+ return DetectionMotionMetricData.no_predictions(), 0, 0, 0
+
+ # ---------------------------------------------
+ # Calculate and interpolate precision and recall
+ # ---------------------------------------------
+
+ # Accumulate.
+ N_tp = np.sum(tp)
+ N_fp = np.sum(fp)
+ tp = np.cumsum(tp).astype(float)
+ fp = np.cumsum(fp).astype(float)
+ conf = np.array(conf)
+
+
+ # Calculate precision and recall.
+ prec = tp / (fp + tp)
+ rec = tp / float(npos)
+
+ rec_interp = np.linspace(0, 1, DetectionMotionMetricData.nelem) # 101 steps, from 0% to 100% recall.
+ prec = np.interp(rec_interp, rec, prec, right=0)
+ conf = np.interp(rec_interp, rec, conf, right=0)
+ rec = rec_interp
+
+ # ---------------------------------------------
+ # Re-sample the match-data to match, prec, recall and conf.
+ # ---------------------------------------------
+
+ for key in match_data.keys():
+ if key == "conf":
+ continue # Confidence is used as reference to align with fp and tp. So skip in this step.
+
+ else:
+ # For each match_data, we first calculate the accumulated mean.
+ tmp = cummean(np.array(match_data[key]))
+
+ # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays)
+ match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1]
+
+ # ---------------------------------------------
+ # Done. Instantiate MetricData and return
+ # ---------------------------------------------
+ return DetectionMotionMetricData(recall=rec,
+ precision=prec,
+ confidence=conf,
+ trans_err=match_data['trans_err'],
+ vel_err=match_data['vel_err'],
+ scale_err=match_data['scale_err'],
+ orient_err=match_data['orient_err'],
+ attr_err=match_data['attr_err'],
+ min_ade_err=match_data['min_ade_err'],
+ min_fde_err=match_data['min_fde_err'],
+ miss_rate_err=match_data['miss_rate_err']
+ ), N_tp, N_fp, npos
+
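+# Usage sketch (illustrative threshold; `metric_data_list` assumed to be a DetectionMotionMetricDataList):
+#   md, n_tp, n_fp, npos = accumulate(gt_boxes, pred_boxes, 'car', center_distance, dist_th=2.0)
+#   metric_data_list.set('car', 2.0, md)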
+
+
+def accumulate_motion(gt_boxes: EvalBoxes,
+ pred_boxes: EvalBoxes,
+ class_name: str,
+ dist_fcn: Callable,
+ traj_fcn: Callable,
+ dist_th: float,
+ traj_dist_th: float,
+ verbose: bool = False,
+ final_step: float = 12) -> DetectionMotionMetricData:
+ """
+    Average precision over predefined recall thresholds for a single distance threshold.
+ The recall/conf thresholds and other raw metrics will be used in secondary metrics.
+ :param gt_boxes: Maps every sample_token to a list of its sample_annotations.
+ :param pred_boxes: Maps every sample_token to a list of its sample_results.
+ :param class_name: Class to compute AP on.
+    :param dist_fcn: Distance function used to match detections and ground truths.
+    :param traj_fcn: Distance function between predicted and ground-truth trajectories (evaluated at `final_step`).
+    :param dist_th: Distance threshold for a match.
+    :param traj_dist_th: Trajectory distance threshold; a match requires both thresholds to be satisfied.
+    :param verbose: If true, print debug messages.
+    :param final_step: Trajectory step index passed to `traj_fcn`.
+    :return: (metric_data, N_tp, N_fp, npos). The raw metric data plus true-positive, false-positive and GT counts.
+ """
+ # ---------------------------------------------
+ # Organize input and initialize accumulators.
+ # ---------------------------------------------
+
+ # Count the positives.
+ npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name])
+ if verbose:
+ print("Found {} GT of class {} out of {} total across {} samples.".
+ format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens)))
+
+ # For missing classes in the GT, return a data structure corresponding to no predictions.
+ if npos == 0:
+ return DetectionMotionMetricData.no_predictions(), 0, 0, 0
+
+ #
+ # Organize the predictions in a single list.
+ pred_boxes_list = []
+ pred_confs = []
+
+ pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name]
+ pred_confs = [box.detection_score for box in pred_boxes_list]
+ # for box in pred_boxes.all:
+ # if box.detection_name == class_name:
+ # box.traj_scores = np.exp(box.traj_scores)
+ # for i in range(len(box.traj_scores)):
+ # box.traj_index = i
+ # pred_boxes_list.append(box)
+ # pred_confs = [box.detection_score * box.traj_scores[box.traj_index] for box in pred_boxes_list]
+
+ if verbose:
+ print("Found {} PRED of class {} out of {} total across {} samples.".
+ format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens)))
+
+ # Sort by confidence.
+ sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1]
+
+ # Do the actual matching.
+ tp = [] # Accumulator of true positives.
+ fp = [] # Accumulator of false positives.
+ conf = [] # Accumulator of confidences.
+
+ # match_data holds the extra metrics we calculate for each match.
+ match_data = {'trans_err': [],
+ 'vel_err': [],
+ 'scale_err': [],
+ 'orient_err': [],
+ 'attr_err': [],
+ 'conf': [],
+ 'min_ade_err': [],
+ 'min_fde_err': [],
+ 'miss_rate_err': []}
+
+ # ---------------------------------------------
+ # Match and accumulate match data.
+ # ---------------------------------------------
+
+ taken = set() # Initially no gt bounding box is matched.
+ for ind in sortind:
+ pred_box = pred_boxes_list[ind]
+ min_dist = np.inf
+ match_gt_idx = None
+
+ for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]):
+
+ # Find closest match among ground truth boxes
+ if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken:
+ this_distance = dist_fcn(gt_box, pred_box)
+ if this_distance < min_dist:
+ min_dist = this_distance
+ match_gt_idx = gt_idx
+ fde_distance = traj_fcn(gt_box, pred_box, final_step)
+ # If the closest match is close enough according to threshold we have a match!
+ is_match = min_dist < dist_th and fde_distance < traj_dist_th
+
+ if is_match:
+ taken.add((pred_box.sample_token, match_gt_idx))
+
+ # Update tp, fp and confs.
+ tp.append(1)
+ fp.append(0)
+ conf.append(pred_box.detection_score)
+
+ # Since it is a match, update match data also.
+ gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx]
+
+ match_data['trans_err'].append(center_distance(gt_box_match, pred_box))
+ match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box))
+ match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box))
+
+            # Barrier orientation is only determined up to 180 degrees. (For cones, orientation is discarded later.)
+ period = np.pi if class_name == 'barrier' else 2 * np.pi
+ match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period))
+
+ match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box))
+ minade, minfde, m_r = prediction_metrics(gt_box_match, pred_box)
+
+ match_data['min_ade_err'].append(minade)
+ match_data['min_fde_err'].append(minfde)
+ match_data['miss_rate_err'].append(m_r)
+ match_data['conf'].append(pred_box.detection_score)
+
+ else:
+ # No match. Mark this as a false positive.
+ tp.append(0)
+ fp.append(1)
+ conf.append(pred_box.detection_score)
+ # conf.append(pred_box.detection_score * pred_box.traj_scores[pred_box.traj_index])
+ #
+ # Check if we have any matches. If not, just return a "no predictions" array.
+ if len(match_data['trans_err']) == 0:
+ return DetectionMotionMetricData.no_predictions(), 0, 0, 0
+
+ # ---------------------------------------------
+ # Calculate and interpolate precision and recall
+ # ---------------------------------------------
+
+ # Accumulate.
+ N_tp = np.sum(tp)
+ N_fp = np.sum(fp)
+ tp = np.cumsum(tp).astype(float)
+ fp = np.cumsum(fp).astype(float)
+ conf = np.array(conf)
+
+ # Calculate precision and recall.
+ prec = tp / (fp + tp)
+ rec = tp / float(npos)
+
+
+
+ rec_interp = np.linspace(0, 1, DetectionMotionMetricData.nelem) # 101 steps, from 0% to 100% recall.
+ prec = np.interp(rec_interp, rec, prec, right=0)
+ conf = np.interp(rec_interp, rec, conf, right=0)
+ rec = rec_interp
+
+ # ---------------------------------------------
+ # Re-sample the match-data to match, prec, recall and conf.
+ # ---------------------------------------------
+
+ for key in match_data.keys():
+ if key == "conf":
+ continue # Confidence is used as reference to align with fp and tp. So skip in this step.
+
+ else:
+ # For each match_data, we first calculate the accumulated mean.
+ tmp = cummean(np.array(match_data[key]))
+
+ # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays)
+ match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1]
+
+ # ---------------------------------------------
+ # Done. Instantiate MetricData and return
+ # ---------------------------------------------
+ return DetectionMotionMetricData(recall=rec,
+ precision=prec,
+ confidence=conf,
+ trans_err=match_data['trans_err'],
+ vel_err=match_data['vel_err'],
+ scale_err=match_data['scale_err'],
+ orient_err=match_data['orient_err'],
+ attr_err=match_data['attr_err'],
+ min_ade_err=match_data['min_ade_err'],
+ min_fde_err=match_data['min_fde_err'],
+ miss_rate_err=match_data['miss_rate_err']
+ ), N_tp, N_fp, npos
\ No newline at end of file
diff --git a/mmcv/datasets/eval_utils/map_api.py b/mmcv/datasets/eval_utils/map_api.py
new file mode 100644
index 0000000..5f26e58
--- /dev/null
+++ b/mmcv/datasets/eval_utils/map_api.py
@@ -0,0 +1,2355 @@
+# nuScenes dev-kit.
+# Code written by Sergi Adipraja Widjaja, 2019.
+# + Map mask by Kiwoo Shin, 2019.
+# + Methods operating on NuScenesMap and NuScenes by Holger Caesar, 2019.
+
+import json
+import os
+import random
+from typing import Dict, List, Tuple, Optional, Union
+
+import cv2
+import math
+import descartes
+import matplotlib.gridspec as gridspec
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+from matplotlib.patches import Rectangle, Arrow
+from mpl_toolkits.axes_grid1.inset_locator import mark_inset
+from pyquaternion import Quaternion
+from shapely import affinity
+from shapely.geometry import Polygon, MultiPolygon, LineString, Point, box
+from tqdm import tqdm
+
+from nuscenes.map_expansion.arcline_path_utils import discretize_lane, ArcLinePath
+from nuscenes.map_expansion.bitmap import BitMap
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.geometry_utils import view_points
+from functools import partial
+
+# Recommended style to use as the plots will show grids.
+plt.style.use('seaborn-whitegrid')
+
+# Define a map geometry type for polygons and lines.
+Geometry = Union[Polygon, LineString]
+
+locations = ['singapore-onenorth', 'singapore-hollandvillage', 'singapore-queenstown', 'boston-seaport']
+
+
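+# Typical usage (sketch; adjust `dataroot` to wherever the nuScenes map expansion is stored):
+#   nusc_map = NuScenesMap(dataroot='/data/sets/nuscenes', map_name='singapore-onenorth')
+#   records = nusc_map.get_records_in_radius(x=300.0, y=1000.0, radius=50.0, layer_names=['lane'])
+#   # records['lane'] is then a list of lane tokens near (x, y).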
+class NuScenesMap:
+ """
+ NuScenesMap database class for querying and retrieving information from the semantic maps.
+ Before using this class please use the provided tutorial `map_expansion_tutorial.ipynb`.
+
+ Below you can find the map origins (south western corner, in [lat, lon]) for each of the 4 maps in nuScenes:
+ boston-seaport: [42.336849169438615, -71.05785369873047]
+ singapore-onenorth: [1.2882100868743724, 103.78475189208984]
+ singapore-hollandvillage: [1.2993652317780957, 103.78217697143555]
+ singapore-queenstown: [1.2782562240223188, 103.76741409301758]
+
+ The dimensions of the maps are as follows ([width, height] in meters):
+ singapore-onenorth: [1585.6, 2025.0]
+ singapore-hollandvillage: [2808.3, 2922.9]
+ singapore-queenstown: [3228.6, 3687.1]
+ boston-seaport: [2979.5, 2118.1]
+ The rasterized semantic maps (e.g. singapore-onenorth.png) published with nuScenes v1.0 have a scale of 10px/m,
+ hence the above numbers are the image dimensions divided by 10.
+
+ We use the same WGS 84 Web Mercator (EPSG:3857) projection as Google Maps/Earth.
+ """
+ def __init__(self,
+ dataroot: str = '/data/sets/nuscenes',
+ map_name: str = 'singapore-onenorth'):
+ """
+        Loads the layers, creates reverse indices and shortcuts, and initializes the explorer class.
+        :param dataroot: Path to the layers in the form of a .json file.
+        :param map_name: Which map out of `singapore-onenorth`, `singapore-hollandvillage`, `singapore-queenstown`,
+            `boston-seaport` that we want to load.
+ """
+ assert map_name in locations, 'Error: Unknown map name %s!' % map_name
+
+ self.dataroot = dataroot
+ self.map_name = map_name
+
+ self.geometric_layers = ['polygon', 'line', 'node']
+
+ # These are the non-geometric layers which have polygons as the geometric descriptors.
+ self.non_geometric_polygon_layers = ['drivable_area', 'road_segment', 'road_block', 'lane', 'ped_crossing',
+ 'walkway', 'stop_line', 'carpark_area']
+
+ # We want to be able to search for lane connectors, but not render them.
+ self.lookup_polygon_layers = self.non_geometric_polygon_layers + ['lane_connector']
+
+ # These are the non-geometric layers which have line strings as the geometric descriptors.
+ self.non_geometric_line_layers = ['road_divider', 'lane_divider', 'traffic_light']
+ self.non_geometric_layers = self.non_geometric_polygon_layers + self.non_geometric_line_layers
+ self.layer_names = self.geometric_layers + self.lookup_polygon_layers + self.non_geometric_line_layers
+
+ # Load the selected map.
+ self.json_fname = os.path.join(self.dataroot, 'maps', 'expansion', '{}.json'.format(self.map_name))
+ with open(self.json_fname, 'r') as fh:
+ self.json_obj = json.load(fh)
+
+ # Parse the map version and print an error for deprecated maps.
+ if 'version' in self.json_obj:
+ self.version = self.json_obj['version']
+ else:
+ self.version = '1.0'
+ if self.version < '1.3':
+            raise Exception('Error: You are using an outdated map version (%s)! '
+                            'Please go to https://www.nuscenes.org/download to download the latest map!' % self.version)
+
+ self.canvas_edge = self.json_obj['canvas_edge']
+ self._load_layers()
+ self._make_token2ind()
+ self._make_shortcuts()
+
+ self.explorer = NuScenesMapExplorer(self)
+
+ def _load_layer(self, layer_name: str) -> List[dict]:
+ """
+ Returns a list of records corresponding to the layer name.
+ :param layer_name: Name of the layer that will be loaded.
+ :return: A list of records corresponding to a layer.
+ """
+ return self.json_obj[layer_name]
+
+ def _load_layer_dict(self, layer_name: str) -> Dict[str, Union[dict, list]]:
+ """
+ Returns a dict of records corresponding to the layer name.
+ :param layer_name: Name of the layer that will be loaded.
+ :return: A dict of records corresponding to a layer.
+ """
+ return self.json_obj[layer_name]
+
+ def _load_layers(self) -> None:
+ """ Loads each available layer. """
+
+        # Explicit assignment of layers is necessary to help the IDE determine valid class members.
+ self.polygon = self._load_layer('polygon')
+ self.line = self._load_layer('line')
+ self.node = self._load_layer('node')
+ self.drivable_area = self._load_layer('drivable_area')
+ self.road_segment = self._load_layer('road_segment')
+ self.road_block = self._load_layer('road_block')
+ self.lane = self._load_layer('lane')
+ self.ped_crossing = self._load_layer('ped_crossing')
+ self.walkway = self._load_layer('walkway')
+ self.stop_line = self._load_layer('stop_line')
+ self.carpark_area = self._load_layer('carpark_area')
+ self.road_divider = self._load_layer('road_divider')
+ self.lane_divider = self._load_layer('lane_divider')
+ self.traffic_light = self._load_layer('traffic_light')
+
+ self.arcline_path_3: Dict[str, List[dict]] = self._load_layer_dict('arcline_path_3')
+ self.connectivity: Dict[str, dict] = self._load_layer_dict('connectivity')
+ self.lane_connector = self._load_layer('lane_connector')
+
+ def _make_token2ind(self) -> None:
+ """ Store the mapping from token to layer index for each layer. """
+ self._token2ind = dict()
+ for layer_name in self.layer_names:
+ self._token2ind[layer_name] = dict()
+
+ for ind, member in enumerate(getattr(self, layer_name)):
+ self._token2ind[layer_name][member['token']] = ind
+
+ def _make_shortcuts(self) -> None:
+ """ Makes the record shortcuts. """
+
+        # Makes a shortcut from non-geometric records to their nodes.
+ for layer_name in self.non_geometric_polygon_layers:
+ if layer_name == 'drivable_area': # Drivable area has more than one geometric representation.
+ pass
+ else:
+ for record in self.__dict__[layer_name]:
+ polygon_obj = self.get('polygon', record['polygon_token'])
+ record['exterior_node_tokens'] = polygon_obj['exterior_node_tokens']
+ record['holes'] = polygon_obj['holes']
+
+ for layer_name in self.non_geometric_line_layers:
+ for record in self.__dict__[layer_name]:
+ record['node_tokens'] = self.get('line', record['line_token'])['node_tokens']
+
+        # Makes a shortcut from stop lines to their cues; there are different cues for different types of stop line.
+ # Refer to `_get_stop_line_cue()` for details.
+ for record in self.stop_line:
+ cue = self._get_stop_line_cue(record)
+ record['cue'] = cue
+
+ # Makes a shortcut between lanes to their lane divider segment nodes.
+ for record in self.lane:
+ record['left_lane_divider_segment_nodes'] = [self.get('node', segment['node_token']) for segment in
+ record['left_lane_divider_segments']]
+ record['right_lane_divider_segment_nodes'] = [self.get('node', segment['node_token']) for segment in
+ record['right_lane_divider_segments']]
+
+ def _get_stop_line_cue(self, stop_line_record: dict) -> List[dict]:
+ """
+ Get the different cues for different types of stop lines.
+ :param stop_line_record: A single stop line record.
+ :return: The cue for that stop line.
+ """
+ if stop_line_record['stop_line_type'] in ['PED_CROSSING', 'TURN_STOP']:
+ return [self.get('ped_crossing', token) for token in stop_line_record['ped_crossing_tokens']]
+ elif stop_line_record['stop_line_type'] in ['STOP_SIGN', 'YIELD']:
+ return []
+ elif stop_line_record['stop_line_type'] == 'TRAFFIC_LIGHT':
+ return [self.get('traffic_light', token) for token in stop_line_record['traffic_light_tokens']]
+
+ def get(self, layer_name: str, token: str) -> dict:
+ """
+ Returns a record from the layer in constant runtime.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record.
+ :return: A single layer record.
+ """
+ assert layer_name in self.layer_names, "Layer {} not found".format(layer_name)
+
+ return getattr(self, layer_name)[self.getind(layer_name, token)]
+
+ def getind(self, layer_name: str, token: str) -> int:
+ """
+ This returns the index of the record in a layer in constant runtime.
+ :param layer_name: Name of the layer we are interested in.
+ :param token: Token of the record.
+ :return: The index of the record in the layer, layer is an array.
+ """
+ return self._token2ind[layer_name][token]
+
+ def render_record(self,
+ layer_name: str,
+ token: str,
+ alpha: float = 0.5,
+ figsize: Tuple[float, float] = None,
+ other_layers: List[str] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Tuple[Axes, Axes]]:
+ """
+        Render a single map record. By default this will also render 3 layers (`drivable_area`, `lane`
+        and `walkway`) unless overridden by `other_layers`.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record that you want to render.
+ :param alpha: The opacity of each layer that gets rendered.
+ :param figsize: Size of the whole figure.
+ :param other_layers: What other layers to render aside from the one specified in `layer_name`.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: The matplotlib figure and axes of the rendered layers.
+ """
+ return self.explorer.render_record(layer_name, token, alpha,
+ figsize=figsize, other_layers=other_layers, bitmap=bitmap)
+
+ def render_layers(self,
+ layer_names: List[str],
+ alpha: float = 0.5,
+ figsize: Union[None, float, Tuple[float, float]] = None,
+ tokens: List[str] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Render a list of layer names.
+ :param layer_names: A list of layer names.
+ :param alpha: The opacity of each layer that gets rendered.
+ :param figsize: Size of the whole figure.
+ :param tokens: Optional list of tokens to render. None means all tokens are rendered.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: The matplotlib figure and axes of the rendered layers.
+ """
+ return self.explorer.render_layers(layer_names, alpha,
+ figsize=figsize, tokens=tokens, bitmap=bitmap)
+
+ def render_map_patch(self,
+ box_coords: Tuple[float, float, float, float],
+ layer_names: List[str] = None,
+ alpha: float = 0.5,
+ figsize: Tuple[int, int] = (15, 15),
+ render_egoposes_range: bool = True,
+ render_legend: bool = True,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Renders a rectangular patch specified by `box_coords`. By default renders all layers.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param layer_names: All the non geometric layers that we want to render.
+ :param alpha: The opacity of each layer.
+ :param figsize: Size of the whole figure.
+ :param render_egoposes_range: Whether to render a rectangle around all ego poses.
+ :param render_legend: Whether to render the legend of map layers.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: The matplotlib figure and axes of the rendered layers.
+ """
+ return self.explorer.render_map_patch(box_coords, layer_names=layer_names, alpha=alpha, figsize=figsize,
+ render_egoposes_range=render_egoposes_range,
+ render_legend=render_legend, bitmap=bitmap)
+
+ def render_map_in_image(self,
+ nusc: NuScenes,
+ sample_token: str,
+ camera_channel: str = 'CAM_FRONT',
+ alpha: float = 0.3,
+ patch_radius: float = 10000,
+ min_polygon_area: float = 1000,
+ render_behind_cam: bool = True,
+ render_outside_im: bool = True,
+ layer_names: List[str] = None,
+ verbose: bool = True,
+ out_path: str = None) -> Tuple[Figure, Axes]:
+ """
+ Render a nuScenes camera image and overlay the polygons for the specified map layers.
+ Note that the projections are not always accurate as the localization is in 2d.
+ :param nusc: The NuScenes instance to load the image from.
+ :param sample_token: The image's corresponding sample_token.
+ :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'.
+ :param alpha: The transparency value of the layers to render in [0, 1].
+ :param patch_radius: The radius in meters around the ego car in which to select map records.
+ :param min_polygon_area: Minimum area a polygon needs to have to be rendered.
+ :param render_behind_cam: Whether to render polygons where any point is behind the camera.
+ :param render_outside_im: Whether to render polygons where any point is outside the image.
+ :param layer_names: The names of the layers to render, e.g. ['lane'].
+ If set to None, the recommended setting will be used.
+ :param verbose: Whether to print to stdout.
+ :param out_path: Optional path to save the rendered figure to disk.
+ """
+ return self.explorer.render_map_in_image(
+ nusc, sample_token, camera_channel=camera_channel, alpha=alpha,
+ patch_radius=patch_radius, min_polygon_area=min_polygon_area,
+ render_behind_cam=render_behind_cam, render_outside_im=render_outside_im,
+ layer_names=layer_names, verbose=verbose, out_path=out_path)
+
+ def get_map_mask_in_image(self,
+ nusc: NuScenes,
+ sample_token: str,
+ camera_channel: str = 'CAM_FRONT',
+ alpha: float = 0.3,
+ patch_radius: float = 10000,
+ min_polygon_area: float = 1000,
+ render_behind_cam: bool = True,
+ render_outside_im: bool = True,
+ layer_names: List[str] = None,
+ verbose: bool = False,
+ out_path: str = None):
+ """
+ Render a nuScenes camera image and overlay the polygons for the specified map layers.
+ Note that the projections are not always accurate as the localization is in 2d.
+ :param nusc: The NuScenes instance to load the image from.
+ :param sample_token: The image's corresponding sample_token.
+ :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'.
+ :param alpha: The transparency value of the layers to render in [0, 1].
+ :param patch_radius: The radius in meters around the ego car in which to select map records.
+ :param min_polygon_area: Minimum area a polygon needs to have to be rendered.
+ :param render_behind_cam: Whether to render polygons where any point is behind the camera.
+ :param render_outside_im: Whether to render polygons where any point is outside the image.
+ :param layer_names: The names of the layers to render, e.g. ['lane'].
+ If set to None, the recommended setting will be used.
+ :param verbose: Whether to print to stdout.
+ :param out_path: Optional path to save the rendered figure to disk.
+ """
+ return self.explorer.get_map_mask_in_image(
+ nusc, sample_token, camera_channel=camera_channel, alpha=alpha,
+ patch_radius=patch_radius, min_polygon_area=min_polygon_area,
+ render_behind_cam=render_behind_cam, render_outside_im=render_outside_im,
+ layer_names=layer_names, verbose=verbose, out_path=out_path)
+
+ def render_egoposes_on_fancy_map(self,
+ nusc: NuScenes,
+ scene_tokens: List = None,
+ verbose: bool = True,
+ out_path: str = None,
+ render_egoposes: bool = True,
+ render_egoposes_range: bool = True,
+ render_legend: bool = True,
+ bitmap: Optional[BitMap] = None) -> Tuple[np.ndarray, Figure, Axes]:
+ """
+ Renders each ego pose of a list of scenes on the map (around 40 poses per scene).
+ This method is heavily inspired by NuScenes.render_egoposes_on_map(), but uses the map expansion pack maps.
+ :param nusc: The NuScenes instance to load the ego poses from.
+ :param scene_tokens: Optional list of scene tokens corresponding to the current map location.
+ :param verbose: Whether to show status messages and progress bar.
+ :param out_path: Optional path to save the rendered figure to disk.
+ :param render_egoposes: Whether to render ego poses.
+ :param render_egoposes_range: Whether to render a rectangle around all ego poses.
+ :param render_legend: Whether to render the legend of map layers.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+        :return: Returns a matrix with n ego poses in global map coordinates.
+ """
+ return self.explorer.render_egoposes_on_fancy_map(nusc, scene_tokens=scene_tokens,
+ verbose=verbose, out_path=out_path,
+ render_egoposes=render_egoposes,
+ render_egoposes_range=render_egoposes_range,
+ render_legend=render_legend, bitmap=bitmap)
+
+ def render_centerlines(self,
+ resolution_meters: float = 0.5,
+ figsize: Union[None, float, Tuple[float, float]] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Render the centerlines of all lanes and lane connectors.
+ :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved
+ lanes are properly represented.
+ :param figsize: Size of the figure.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ """
+ return self.explorer.render_centerlines(resolution_meters=resolution_meters, figsize=figsize, bitmap=bitmap)
+
+ def render_map_mask(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_names: List[str] = None,
+ canvas_size: Tuple[int, int] = (100, 100),
+ figsize: Tuple[int, int] = (15, 15),
+ n_row: int = 2) -> Tuple[Figure, List[Axes]]:
+ """
+ Render map mask of the patch specified by patch_box and patch_angle.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ :param layer_names: A list of layer names to be returned.
+ :param canvas_size: Size of the output mask (h, w).
+ :param figsize: Size of the figure.
+ :param n_row: Number of rows with plots.
+ :return: The matplotlib figure and a list of axes of the rendered layers.
+ """
+ return self.explorer.render_map_mask(patch_box, patch_angle,
+ layer_names=layer_names, canvas_size=canvas_size,
+ figsize=figsize, n_row=n_row)
+
+ def get_map_mask(self,
+ patch_box: Optional[Tuple[float, float, float, float]],
+ patch_angle: float,
+ layer_names: List[str] = None,
+ canvas_size: Optional[Tuple[int, int]] = (100, 100)) -> np.ndarray:
+ """
+ Return list of map mask layers of the specified patch.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width]. If None, this plots the entire map.
+ :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0.
+ :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers.
+ :param canvas_size: Size of the output mask (h, w). If None, we use the default resolution of 10px/m.
+ :return: Stacked numpy array of size [c x h x w] with c channels and the same width/height as the canvas.
+ """
+ return self.explorer.get_map_mask(patch_box, patch_angle, layer_names=layer_names, canvas_size=canvas_size)
+
+ def get_map_geom(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_names: List[str]) -> List[Tuple[str, List[Geometry]]]:
+ """
+ Returns a list of geometries in the specified patch_box.
+ These are unscaled, but aligned with the patch angle.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ North-facing corresponds to 0.
+ :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers.
+ :return: List of layer names and their corresponding geometries.
+ """
+ return self.explorer.get_map_geom(patch_box, patch_angle, layer_names)
+
+ def get_records_in_patch(self,
+ box_coords: Tuple[float, float, float, float],
+ layer_names: List[str] = None,
+ mode: str = 'intersect') -> Dict[str, List[str]]:
+ """
+        Get all the record tokens that intersect or are within a particular rectangular patch.
+        :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+        :param layer_names: Names of the layers that we want to retrieve in a particular patch. By default this will
+            look at all the non-geometric layers.
+        :param mode: "intersect" will return all non-geometric records that intersect the patch, "within" will return
+            all non-geometric records that are within the patch.
+ :return: Dictionary of layer_name - tokens pairs.
+ """
+ return self.explorer.get_records_in_patch(box_coords, layer_names=layer_names, mode=mode)
+
+ def is_record_in_patch(self,
+ layer_name: str,
+ token: str,
+ box_coords: Tuple[float, float, float, float],
+ mode: str = 'intersect') -> bool:
+ """
+ Query whether a particular record is in a rectangular patch
+ :param layer_name: The layer name of the record.
+ :param token: The record token.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param mode: "intersect" means it will return True if the geometric object intersects the patch, "within" will
+ return True if the geometric object is within the patch.
+        :return: Boolean value on whether a particular record intersects or is within a particular patch.
+ """
+ return self.explorer.is_record_in_patch(layer_name, token, box_coords, mode=mode)
+
+ def layers_on_point(self, x: float, y: float, layer_names: List[str] = None) -> Dict[str, str]:
+ """
+ Returns all the polygonal layers that a particular point is on.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :param layer_names: The names of the layers to search for.
+        :return: All the polygonal layers that a particular point is on, as a dict mapping layer name to record token.
+ """
+ return self.explorer.layers_on_point(x, y, layer_names=layer_names)
+
+ def record_on_point(self, x: float, y: float, layer_name: str) -> str:
+ """
+ Query what record of a layer a particular point is on.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :param layer_name: The non geometric polygonal layer name that we are interested in.
+ :return: The first token of a layer a particular point is on or '' if no layer is found.
+ """
+ return self.explorer.record_on_point(x, y, layer_name)
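+ # Hypothetical usage sketch of the point queries (coordinates and `nusc_map` are illustrative assumptions):
+ #   layers = nusc_map.layers_on_point(x=600.0, y=1600.0)
+ #   # e.g. {'drivable_area': '<token>', 'lane': '<token>', 'walkway': '', ...}
+ #   lane_token = nusc_map.record_on_point(x=600.0, y=1600.0, layer_name='lane')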
+
+ def extract_polygon(self, polygon_token: str) -> Polygon:
+ """
+ Construct a shapely Polygon object out of a polygon token.
+ :param polygon_token: The token of the polygon record.
+ :return: The polygon wrapped in a shapely Polygon object.
+ """
+ return self.explorer.extract_polygon(polygon_token)
+
+ def extract_line(self, line_token: str) -> LineString:
+ """
+ Construct a shapely LineString object out of a line token.
+ :param line_token: The token of the line record.
+ :return: The line wrapped in a LineString object.
+ """
+ return self.explorer.extract_line(line_token)
+
+ def get_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]:
+ """
+ Get the bounds of the geometric object that corresponds to a non geometric record.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record.
+ :return: min_x, min_y, max_x, max_y of the line representation.
+ """
+ return self.explorer.get_bounds(layer_name, token)
+
+ def get_records_in_radius(self, x: float, y: float, radius: float,
+ layer_names: List[str], mode: str = 'intersect') -> Dict[str, List[str]]:
+ """
+ Get all the record tokens that intersect a square patch of side length 2*radius centered on (x,y).
+ :param x: X-coordinate in global frame.
+ :param y: y-coordinate in global frame.
+ :param radius: All records within radius meters of point (x, y) will be returned.
+ :param layer_names: Names of the layers that we want to retrieve. By default will always
+ look at all non-geometric layers.
+ :param mode: "intersect" will return all non-geometric records that intersect the patch, "within" will return
+ all non-geometric records that are within the patch.
+ :return: Dictionary of layer_name - tokens pairs.
+ """
+
+ patch = (x - radius, y - radius, x + radius, y + radius)
+ return self.explorer.get_records_in_patch(patch, layer_names, mode=mode)
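+ # Hypothetical usage sketch (illustrative only; assumes a NuScenesMap instance `nusc_map` and example coordinates):
+ #   records = nusc_map.get_records_in_radius(x=600.0, y=1600.0, radius=10,
+ #                                            layer_names=['lane', 'lane_connector'])
+ #   # `records` maps each layer name to the tokens whose geometry intersects the
+ #   # 20 m x 20 m square patch (x - 10, y - 10, x + 10, y + 10).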
+
+ def discretize_centerlines(self, resolution_meters: float) -> List[np.array]:
+ """
+ Discretize the centerlines of lanes and lane connectors.
+ :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved
+ lanes are properly represented.
+ :return: A list of np.arrays with x, y and z values for each point.
+ """
+ pose_lists = []
+ for lane in self.lane + self.lane_connector:
+ my_lane = self.arcline_path_3.get(lane['token'], [])
+ discretized = np.array(discretize_lane(my_lane, resolution_meters))
+ pose_lists.append(discretized)
+
+ return pose_lists
+
+ def discretize_lanes(self, tokens: List[str],
+ resolution_meters: float) -> Dict[str, List[Tuple[float, float, float]]]:
+ """
+ Discretizes a list of lane/lane connector tokens.
+ :param tokens: List of lane and/or lane connector record tokens. Can be retrieved with
+ get_records_in_radius or get_records_in_patch.
+ :param resolution_meters: How finely to discretize the splines.
+ :return: Mapping from lane/lane connector token to sequence of poses along the lane.
+ """
+
+ return {ID: discretize_lane(self.arcline_path_3.get(ID, []), resolution_meters) for ID in tokens}
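+ # Hypothetical usage sketch (assumes the `nusc_map` instance from above):
+ #   lanes = nusc_map.get_records_in_radius(x=600.0, y=1600.0, radius=10,
+ #                                          layer_names=['lane', 'lane_connector'])
+ #   poses = nusc_map.discretize_lanes(lanes['lane'] + lanes['lane_connector'], resolution_meters=0.5)
+ #   # poses[token] is a sequence of 3-tuples sampled roughly every 0.5 m along that lane.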
+
+ def _get_connected_lanes(self, lane_token: str, incoming_outgoing: str) -> List[str]:
+ """
+ Helper for getting the lanes connected to a given lane.
+ :param lane_token: Token for the lane.
+ :param incoming_outgoing: Whether to get incoming or outgoing lanes.
+ :return: List of lane tokens this lane is connected to.
+ """
+
+ if lane_token not in self.connectivity:
+ raise ValueError(f"{lane_token} is not a valid lane.")
+
+ return self.connectivity[lane_token][incoming_outgoing]
+
+ def get_outgoing_lane_ids(self, lane_token: str) -> List[str]:
+ """
+ Get the outgoing lanes.
+ :param lane_token: Token for the lane.
+ :return: List of lane tokens that start at the end of this lane.
+ """
+
+ return self._get_connected_lanes(lane_token, 'outgoing')
+
+ def get_incoming_lane_ids(self, lane_token: str) -> List[str]:
+ """
+ Get the incoming lanes.
+ :param lane_token: Token for the lane.
+ :return: List of lane tokens that end at the start of this lane.
+ """
+
+ return self._get_connected_lanes(lane_token, 'incoming')
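+ # Hypothetical usage sketch of the connectivity helpers (assumes a valid `lane_token`):
+ #   outgoing = nusc_map.get_outgoing_lane_ids(lane_token)   # lanes starting where this one ends
+ #   incoming = nusc_map.get_incoming_lane_ids(lane_token)   # lanes ending where this one starts
+ #   # Both raise ValueError if `lane_token` is not present in the connectivity table.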
+
+ def get_arcline_path(self, lane_token: str) -> List[ArcLinePath]:
+ """
+ Get the arcline path representation for a lane.
+ Note: This function was previously called `get_lane()`, but renamed to avoid confusion between lanes and
+ arcline paths.
+ :param lane_token: Token for the lane.
+ :return: Arc line path representation of the lane.
+ """
+
+ arcline_path = self.arcline_path_3.get(lane_token)
+ if not arcline_path:
+ raise ValueError(f'Error: Lane with token {lane_token} does not have a valid arcline path!')
+
+ return arcline_path
+
+ def get_closest_lane(self, x: float, y: float, radius: float = 5) -> str:
+ """
+ Get the closest lane id within a radius of the query point. The distance from a point (x, y) to a lane is
+ the minimum l2 distance from (x, y) to a point on the lane.
+ :param x: X coordinate in global coordinate frame.
+ :param y: Y Coordinate in global coordinate frame.
+ :param radius: Radius around point to consider.
+ :return: Lane id of closest lane within radius.
+ """
+
+ lanes = self.get_records_in_radius(x, y, radius, ['lane', 'lane_connector'])
+ lanes = lanes['lane'] + lanes['lane_connector']
+
+ discrete_points = self.discretize_lanes(lanes, 0.5)
+
+ current_min = np.inf
+
+ min_id = ""
+ for lane_id, points in discrete_points.items():
+
+ distance = np.linalg.norm(np.array(points)[:, :2] - [x, y], axis=1).min()
+ if distance <= current_min:
+ current_min = distance
+ min_id = lane_id
+
+ return min_id
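+ # Hypothetical usage sketch (coordinates are illustrative):
+ #   closest = nusc_map.get_closest_lane(x=600.0, y=1600.0, radius=5)
+ #   # Returns the token of the lane/lane connector whose discretized centerline (sampled at
+ #   # 0.5 m) is closest to (x, y) in 2D, or '' if none is found within the search radius.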
+
+ def render_next_roads(self,
+ x: float,
+ y: float,
+ alpha: float = 0.5,
+ figsize: Union[None, float, Tuple[float, float]] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Renders the possible next roads from a point of interest.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :param alpha: The opacity of each layer that gets rendered.
+ :param figsize: Size of the whole figure.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ """
+ return self.explorer.render_next_roads(x, y, alpha, figsize=figsize, bitmap=bitmap)
+
+ def get_next_roads(self, x: float, y: float) -> Dict[str, List[str]]:
+ """
+ Get the possible next roads from a point of interest.
+ Returns road_segment, road_block and lane.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :return: Dictionary of layer_name - tokens pairs.
+ """
+ # Filter out irrelevant layers.
+ road_layers = ['road_segment', 'road_block', 'lane']
+ layers = self.explorer.layers_on_point(x, y)
+ rel_layers = {layer: layers[layer] for layer in road_layers}
+
+ # Pick most fine-grained road layer (lane, road_block, road_segment) object that contains the point.
+ rel_layer = None
+ rel_token = None
+ for layer in road_layers[::-1]:
+ if rel_layers[layer] != '':
+ rel_layer = layer
+ rel_token = rel_layers[layer]
+ break
+ assert rel_layer is not None, 'Error: No suitable layer in the specified point location!'
+
+ # Get all records that overlap with the bounding box of the selected road.
+ box_coords = self.explorer.get_bounds(rel_layer, rel_token)
+ intersect = self.explorer.get_records_in_patch(box_coords, road_layers, mode='intersect')
+
+ # Go through all objects within the bounding box.
+ result = {layer: [] for layer in road_layers}
+ if rel_layer == 'road_segment':
+ # For road segments, we do not have a direction.
+ # Return objects that have ANY exterior points in common with the relevant layer.
+ rel_exterior_nodes = self.get(rel_layer, rel_token)['exterior_node_tokens']
+ for layer in road_layers:
+ for token in intersect[layer]:
+ exterior_nodes = self.get(layer, token)['exterior_node_tokens']
+ if any(n in exterior_nodes for n in rel_exterior_nodes) \
+ and token != rel_layers[layer]:
+ result[layer].append(token)
+ else:
+ # For lanes and road blocks, the next road is indicated by the edge line.
+ # Return objects where ALL edge line nodes are included in the exterior nodes.
+ to_edge_line = self.get(rel_layer, rel_token)['to_edge_line_token']
+ to_edge_nodes = self.get('line', to_edge_line)['node_tokens']
+ for layer in road_layers:
+ for token in intersect[layer]:
+ exterior_nodes = self.get(layer, token)['exterior_node_tokens']
+ if all(n in exterior_nodes for n in to_edge_nodes) \
+ and token != rel_layers[layer]:
+ result[layer].append(token)
+ return result
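+ # Hypothetical usage sketch:
+ #   next_roads = nusc_map.get_next_roads(x=600.0, y=1600.0)
+ #   # e.g. {'road_segment': [], 'road_block': ['<token>'], 'lane': ['<token>', ...]}
+ #   # The point must lie on at least one of road_segment/road_block/lane,
+ #   # otherwise the assertion above fails.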
+
+
+class NuScenesMapExplorer:
+ """ Helper class to explore the nuScenes map data. """
+ def __init__(self,
+ map_api: NuScenesMap,
+ representative_layers: Tuple[str] = ('drivable_area', 'lane', 'walkway'),
+ color_map: dict = None):
+ """
+ :param map_api: NuScenesMap database class.
+ :param representative_layers: These are the layers that we feel are representative of the whole mapping data.
+ :param color_map: Color map.
+ """
+ # Mutable default argument.
+ if color_map is None:
+ color_map = dict(drivable_area='#a6cee3',
+ road_segment='#1f78b4',
+ road_block='#b2df8a',
+ lane='#33a02c',
+ ped_crossing='#fb9a99',
+ walkway='#e31a1c',
+ stop_line='#fdbf6f',
+ carpark_area='#ff7f00',
+ road_divider='#cab2d6',
+ lane_divider='#6a3d9a',
+ traffic_light='#7e772e')
+
+ self.map_api = map_api
+ self.representative_layers = representative_layers
+ self.color_map = color_map
+
+ self.canvas_max_x = self.map_api.canvas_edge[0]
+ self.canvas_min_x = 0
+ self.canvas_max_y = self.map_api.canvas_edge[1]
+ self.canvas_min_y = 0
+ self.canvas_aspect_ratio = (self.canvas_max_x - self.canvas_min_x) / (self.canvas_max_y - self.canvas_min_y)
+
+ def render_centerlines(self,
+ resolution_meters: float,
+ figsize: Union[None, float, Tuple[float, float]] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Render the centerlines of all lanes and lane connectors.
+ :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved
+ lanes are properly represented.
+ :param figsize: Size of the figure.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ """
+ # Discretize all lanes and lane connectors.
+ pose_lists = self.map_api.discretize_centerlines(resolution_meters)
+
+ # Render connectivity lines.
+ fig = plt.figure(figsize=self._get_figsize(figsize))
+ ax = fig.add_axes([0, 0, 1, 1 / self.canvas_aspect_ratio])
+
+ if bitmap is not None:
+ bitmap.render(self.map_api.canvas_edge, ax)
+
+ for pose_list in pose_lists:
+ if len(pose_list) > 0:
+ plt.plot(pose_list[:, 0], pose_list[:, 1])
+
+ return fig, ax
+
+ def render_map_mask(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_names: List[str],
+ canvas_size: Tuple[int, int],
+ figsize: Tuple[int, int],
+ n_row: int = 2) -> Tuple[Figure, List[Axes]]:
+ """
+ Render map mask of the patch specified by patch_box and patch_angle.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ :param layer_names: A list of layer names to be extracted.
+ :param canvas_size: Size of the output mask (h, w).
+ :param figsize: Size of the figure.
+ :param n_row: Number of rows with plots.
+ :return: The matplotlib figure and a list of axes of the rendered layers.
+ """
+ if layer_names is None:
+ layer_names = self.map_api.non_geometric_layers
+
+ map_mask = self.get_map_mask(patch_box, patch_angle, layer_names, canvas_size)
+
+ # If no canvas_size is specified, retrieve the default from the output of get_map_mask.
+ if canvas_size is None:
+ canvas_size = map_mask.shape[1:]
+
+ fig = plt.figure(figsize=figsize)
+ ax = fig.add_axes([0, 0, 1, 1])
+ ax.set_xlim(0, canvas_size[1])
+ ax.set_ylim(0, canvas_size[0])
+
+ n_col = len(map_mask) // n_row
+ gs = gridspec.GridSpec(n_row, n_col)
+ gs.update(wspace=0.025, hspace=0.05)
+ for i in range(len(map_mask)):
+ r = i // n_col
+ c = i - r * n_col
+ subax = plt.subplot(gs[r, c])
+ subax.imshow(map_mask[i], origin='lower')
+ subax.text(canvas_size[0] * 0.5, canvas_size[1] * 1.1, layer_names[i])
+ subax.grid(False)
+
+ return fig, fig.axes
+
+ def get_map_geom(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_names: List[str]) -> List[Tuple[str, List[Geometry]]]:
+ """
+ Returns a list of geometries in the specified patch_box.
+ These are unscaled, but aligned with the patch angle.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ North-facing corresponds to 0.
+ :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers.
+ :return: List of layer names and their corresponding geometries.
+ """
+ # If None, use all non-geometric layers.
+ if layer_names is None:
+ layer_names = self.map_api.non_geometric_layers
+
+ # Get each layer name and geometry and store them in a list.
+ map_geom = []
+ for layer_name in layer_names:
+ layer_geom = self._get_layer_geom(patch_box, patch_angle, layer_name)
+ if layer_geom is None:
+ continue
+ map_geom.append((layer_name, layer_geom))
+
+ return map_geom
+
+ def map_geom_to_mask(self,
+ map_geom: List[Tuple[str, List[Geometry]]],
+ local_box: Tuple[float, float, float, float],
+ canvas_size: Tuple[int, int]) -> np.ndarray:
+ """
+ Return list of map mask layers of the specified patch.
+ :param map_geom: List of layer names and their corresponding geometries.
+ :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically
+ x_center = y_center = 0.
+ :param canvas_size: Size of the output mask (h, w).
+ :return: Stacked numpy array of size [c x h x w] with c channels and the same height/width as the canvas.
+ """
+ # Get each layer mask and stack them into a numpy tensor.
+ map_mask = []
+ for layer_name, layer_geom in map_geom:
+ layer_mask = self._layer_geom_to_mask(layer_name, layer_geom, local_box, canvas_size)
+ if layer_mask is not None:
+ map_mask.append(layer_mask)
+
+ return np.array(map_mask)
+
+ def get_map_mask(self,
+ patch_box: Optional[Tuple[float, float, float, float]],
+ patch_angle: float,
+ layer_names: List[str] = None,
+ canvas_size: Tuple[int, int] = (100, 100)) -> np.ndarray:
+ """
+ Return list of map mask layers of the specified patch.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width]. If None, the patch covers the entire map.
+ :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0.
+ :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers.
+ :param canvas_size: Size of the output mask (h, w). If None, we use the default resolution of 10px/m.
+ :return: Stacked numpy array of size [c x h x w] with c channels and the same width/height as the canvas.
+ """
+ # For some combination of parameters, we need to know the size of the current map.
+ if self.map_api.map_name == 'singapore-onenorth':
+ map_dims = [1585.6, 2025.0]
+ elif self.map_api.map_name == 'singapore-hollandvillage':
+ map_dims = [2808.3, 2922.9]
+ elif self.map_api.map_name == 'singapore-queenstown':
+ map_dims = [3228.6, 3687.1]
+ elif self.map_api.map_name == 'boston-seaport':
+ map_dims = [2979.5, 2118.1]
+ else:
+ raise Exception('Error: Invalid map!')
+
+ # If None, return the entire map.
+ if patch_box is None:
+ patch_box = [map_dims[0] / 2, map_dims[1] / 2, map_dims[1], map_dims[0]]
+
+ # If None, use all non-geometric layers.
+ if layer_names is None:
+ layer_names = self.map_api.non_geometric_layers
+
+ # If None, return the specified patch in the original scale of 10px/m.
+ if canvas_size is None:
+ map_scale = 10
+ canvas_size = np.array((patch_box[2], patch_box[3])) * map_scale
+ canvas_size = tuple(np.round(canvas_size).astype(np.int32))
+
+ # Get geometry of each layer.
+ map_geom = self.get_map_geom(patch_box, patch_angle, layer_names)
+
+ # Convert geometry of each layer into mask and stack them into a numpy tensor.
+ # Convert the patch box from global coordinates to local coordinates by setting the center to (0, 0).
+ local_box = (0.0, 0.0, patch_box[2], patch_box[3])
+ map_mask = self.map_geom_to_mask(map_geom, local_box, canvas_size)
+ assert np.all(map_mask.shape[1:] == canvas_size)
+
+ return map_mask
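+ # Illustrative example of the default canvas sizing (numbers are hypothetical):
+ #   a patch_box of (300.0, 1700.0, 100.0, 50.0) with canvas_size=None yields a canvas of
+ #   (100 * 10, 50 * 10) = (1000, 500) pixels at the default 10 px/m, and the returned
+ #   mask is stacked as [channels, 1000, 500].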
+
+ def render_record(self,
+ layer_name: str,
+ token: str,
+ alpha: float = 0.5,
+ figsize: Union[None, float, Tuple[float, float]] = None,
+ other_layers: List[str] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Tuple[Axes, Axes]]:
+ """
+ Render a single map record.
+ By default this will also render 3 layers, `drivable_area`, `lane` and `walkway`, unless overridden by
+ `other_layers`.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record that you want to render.
+ :param alpha: The opacity of each layer that gets rendered.
+ :param figsize: Size of the whole figure.
+ :param other_layers: What other layers to render aside from the one specified in `layer_name`.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: The matplotlib figure and axes of the rendered layers.
+ """
+ if other_layers is None:
+ other_layers = list(self.representative_layers)
+
+ for other_layer in other_layers:
+ if other_layer not in self.map_api.non_geometric_layers:
+ raise ValueError("{} is not a non geometric layer".format(layer_name))
+
+ x1, y1, x2, y2 = self.map_api.get_bounds(layer_name, token)
+
+ local_width = x2 - x1
+ local_height = y2 - y1
+ assert local_height > 0, 'Error: Map has 0 height!'
+ local_aspect_ratio = local_width / local_height
+
+ # We obtained the values 0.65 and 0.66 by trials.
+ fig = plt.figure(figsize=self._get_figsize(figsize))
+ global_ax = fig.add_axes([0, 0, 0.65, 0.65 / self.canvas_aspect_ratio])
+ local_ax = fig.add_axes([0.66, 0.66 / self.canvas_aspect_ratio, 0.34, 0.34 / local_aspect_ratio])
+
+ # Fix the random seed so that the ordering of the layer overlays stays consistent after the set() conversion below.
+ random.seed('nutonomy')
+
+ if bitmap is not None:
+ bitmap.render(self.map_api.canvas_edge, global_ax)
+ bitmap.render(self.map_api.canvas_edge, local_ax)
+
+ layer_names = other_layers + [layer_name]
+ layer_names = list(set(layer_names))
+
+ for layer in layer_names:
+ self._render_layer(global_ax, layer, alpha)
+
+ for layer in layer_names:
+ self._render_layer(local_ax, layer, alpha)
+
+ if layer_name == 'drivable_area':
+ # Bad output aesthetically if we add spacing between the objects and the axes for drivable area.
+ local_ax_xlim = (x1, x2)
+ local_ax_ylim = (y1, y2)
+ else:
+ # Add some spacing between the object and the axes.
+ local_ax_xlim = (x1 - local_width / 3, x2 + local_width / 3)
+ local_ax_ylim = (y1 - local_height / 3, y2 + local_height / 3)
+
+ # Draws the rectangular patch on the local_ax.
+ local_ax.add_patch(Rectangle((x1, y1), local_width, local_height, linestyle='-.', color='red', fill=False,
+ lw=2))
+
+ local_ax.set_xlim(*local_ax_xlim)
+ local_ax.set_ylim(*local_ax_ylim)
+ local_ax.set_title('Local View')
+
+ global_ax.set_xlim(self.canvas_min_x, self.canvas_max_x)
+ global_ax.set_ylim(self.canvas_min_y, self.canvas_max_y)
+ global_ax.set_title('Global View')
+ global_ax.legend()
+
+ # Adds the zoomed in effect to the plot.
+ mark_inset(global_ax, local_ax, loc1=2, loc2=4)
+
+ return fig, (global_ax, local_ax)
+
+ def render_layers(self,
+ layer_names: List[str],
+ alpha: float,
+ figsize: Union[None, float, Tuple[float, float]],
+ tokens: List[str] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Render a list of layers.
+ :param layer_names: A list of layer names.
+ :param alpha: The opacity of each layer.
+ :param figsize: Size of the whole figure.
+ :param tokens: Optional list of tokens to render. None means all tokens are rendered.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: The matplotlib figure and axes of the rendered layers.
+ """
+ fig = plt.figure(figsize=self._get_figsize(figsize))
+ ax = fig.add_axes([0, 0, 1, 1 / self.canvas_aspect_ratio])
+
+ ax.set_xlim(self.canvas_min_x, self.canvas_max_x)
+ ax.set_ylim(self.canvas_min_y, self.canvas_max_y)
+
+ if bitmap is not None:
+ bitmap.render(self.map_api.canvas_edge, ax)
+
+ layer_names = list(set(layer_names))
+ for layer_name in layer_names:
+ self._render_layer(ax, layer_name, alpha, tokens)
+
+ ax.legend()
+
+ return fig, ax
+
+ def render_map_patch(self,
+ box_coords: Tuple[float, float, float, float],
+ layer_names: List[str] = None,
+ alpha: float = 0.5,
+ figsize: Tuple[float, float] = (15, 15),
+ render_egoposes_range: bool = True,
+ render_legend: bool = True,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Renders a rectangular patch specified by `box_coords`. By default renders all layers.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param layer_names: All the non geometric layers that we want to render.
+ :param alpha: The opacity of each layer.
+ :param figsize: Size of the whole figure.
+ :param render_egoposes_range: Whether to render a rectangle around all ego poses.
+ :param render_legend: Whether to render the legend of map layers.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: The matplotlib figure and axes of the rendered layers.
+ """
+ x_min, y_min, x_max, y_max = box_coords
+
+ if layer_names is None:
+ layer_names = self.map_api.non_geometric_layers
+
+ fig = plt.figure(figsize=figsize)
+
+ local_width = x_max - x_min
+ local_height = y_max - y_min
+ assert local_height > 0, 'Error: Map patch has 0 height!'
+ local_aspect_ratio = local_width / local_height
+
+ ax = fig.add_axes([0, 0, 1, 1 / local_aspect_ratio])
+
+ if bitmap is not None:
+ bitmap.render(self.map_api.canvas_edge, ax)
+
+ for layer_name in layer_names:
+ self._render_layer(ax, layer_name, alpha)
+
+ x_margin = np.minimum(local_width / 4, 50)
+ y_margin = np.minimum(local_height / 4, 10)
+ ax.set_xlim(x_min - x_margin, x_max + x_margin)
+ ax.set_ylim(y_min - y_margin, y_max + y_margin)
+
+ if render_egoposes_range:
+ ax.add_patch(Rectangle((x_min, y_min), local_width, local_height, fill=False, linestyle='-.', color='red',
+ lw=2))
+ ax.text(x_min + local_width / 100, y_min + local_height / 2, "%g m" % local_height,
+ fontsize=14, weight='bold')
+ ax.text(x_min + local_width / 2, y_min + local_height / 100, "%g m" % local_width,
+ fontsize=14, weight='bold')
+
+ if render_legend:
+ ax.legend(frameon=True, loc='upper right')
+
+ return fig, ax
+
+ def render_map_in_image(self,
+ nusc: NuScenes,
+ sample_token: str,
+ camera_channel: str = 'CAM_FRONT',
+ alpha: float = 0.3,
+ patch_radius: float = 10000,
+ min_polygon_area: float = 1000,
+ render_behind_cam: bool = True,
+ render_outside_im: bool = True,
+ layer_names: List[str] = None,
+ verbose: bool = True,
+ out_path: str = None) -> Tuple[Figure, Axes]:
+ """
+ Render a nuScenes camera image and overlay the polygons for the specified map layers.
+ Note that the projections are not always accurate as the localization is in 2d.
+ :param nusc: The NuScenes instance to load the image from.
+ :param sample_token: The image's corresponding sample_token.
+ :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'.
+ :param alpha: The transparency value of the layers to render in [0, 1].
+ :param patch_radius: The radius in meters around the ego car in which to select map records.
+ :param min_polygon_area: Minimum area a polygon needs to have to be rendered.
+ :param render_behind_cam: Whether to render polygons where any point is behind the camera.
+ :param render_outside_im: Whether to render polygons where any point is outside the image.
+ :param layer_names: The names of the layers to render, e.g. ['lane'].
+ If set to None, the recommended setting will be used.
+ :param verbose: Whether to print to stdout.
+ :param out_path: Optional path to save the rendered figure to disk.
+ """
+ near_plane = 1e-8
+
+ if verbose:
+ print('Warning: Note that the projections are not always accurate as the localization is in 2d.')
+
+ # Default layers.
+ if layer_names is None:
+ layer_names = ['road_segment', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area']
+
+ # Check whether the requested layers can be rendered.
+ for layer_name in layer_names:
+ assert layer_name in self.map_api.non_geometric_polygon_layers, \
+ 'Error: Can only render non-geometric polygon layers: %s' % layer_names
+
+ # Check that NuScenesMap was loaded for the correct location.
+ sample_record = nusc.get('sample', sample_token)
+ scene_record = nusc.get('scene', sample_record['scene_token'])
+ log_record = nusc.get('log', scene_record['log_token'])
+ log_location = log_record['location']
+ assert self.map_api.map_name == log_location, \
+ 'Error: NuScenesMap loaded for location %s, should be %s!' % (self.map_api.map_name, log_location)
+
+ # Grab the front camera image and intrinsics.
+ cam_token = sample_record['data'][camera_channel]
+ cam_record = nusc.get('sample_data', cam_token)
+ cam_path = nusc.get_sample_data_path(cam_token)
+ im = Image.open(cam_path)
+ im_size = im.size
+ cs_record = nusc.get('calibrated_sensor', cam_record['calibrated_sensor_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+
+ # Retrieve the current map.
+ poserecord = nusc.get('ego_pose', cam_record['ego_pose_token'])
+ ego_pose = poserecord['translation']
+ box_coords = (
+ ego_pose[0] - patch_radius,
+ ego_pose[1] - patch_radius,
+ ego_pose[0] + patch_radius,
+ ego_pose[1] + patch_radius,
+ )
+ records_in_patch = self.get_records_in_patch(box_coords, layer_names, 'intersect')
+
+ # Init axes.
+ fig = plt.figure(figsize=(9, 16))
+ ax = fig.add_axes([0, 0, 1, 1])
+ ax.set_xlim(0, im_size[0])
+ ax.set_ylim(0, im_size[1])
+ ax.imshow(im)
+
+ # Retrieve and render each record.
+ for layer_name in layer_names:
+ for token in records_in_patch[layer_name]:
+ record = self.map_api.get(layer_name, token)
+ if layer_name == 'drivable_area':
+ polygon_tokens = record['polygon_tokens']
+ else:
+ polygon_tokens = [record['polygon_token']]
+
+ for polygon_token in polygon_tokens:
+ polygon = self.map_api.extract_polygon(polygon_token)
+
+ # Convert polygon nodes to pointcloud with 0 height.
+ points = np.array(polygon.exterior.xy)
+ points = np.vstack((points, np.zeros((1, points.shape[1]))))
+
+ # Transform into the ego vehicle frame for the timestamp of the image.
+ points = points - np.array(poserecord['translation']).reshape((-1, 1))
+ points = np.dot(Quaternion(poserecord['rotation']).rotation_matrix.T, points)
+
+ # Transform into the camera.
+ points = points - np.array(cs_record['translation']).reshape((-1, 1))
+ points = np.dot(Quaternion(cs_record['rotation']).rotation_matrix.T, points)
+
+ # Remove points that are partially behind the camera.
+ depths = points[2, :]
+ behind = depths < near_plane
+ if np.all(behind):
+ continue
+
+ if render_behind_cam:
+ # Perform clipping on polygons that are partially behind the camera.
+ points = NuScenesMapExplorer._clip_points_behind_camera(points, near_plane)
+ elif np.any(behind):
+ # Otherwise ignore any polygon that is partially behind the camera.
+ continue
+
+ # Ignore polygons with less than 3 points after clipping.
+ if len(points) == 0 or points.shape[1] < 3:
+ continue
+
+ # Take the actual picture (matrix multiplication with camera-matrix + renormalization).
+ points = view_points(points, cam_intrinsic, normalize=True)
+
+ # Skip polygons where all points are outside the image.
+ # Leave a margin of 1 pixel for aesthetic reasons.
+ inside = np.ones(points.shape[1], dtype=bool)
+ inside = np.logical_and(inside, points[0, :] > 1)
+ inside = np.logical_and(inside, points[0, :] < im.size[0] - 1)
+ inside = np.logical_and(inside, points[1, :] > 1)
+ inside = np.logical_and(inside, points[1, :] < im.size[1] - 1)
+ if render_outside_im:
+ if np.all(np.logical_not(inside)):
+ continue
+ else:
+ if np.any(np.logical_not(inside)):
+ continue
+
+ points = points[:2, :]
+ points = [(p0, p1) for (p0, p1) in zip(points[0], points[1])]
+ polygon_proj = Polygon(points)
+
+ # Filter small polygons
+ if polygon_proj.area < min_polygon_area:
+ continue
+
+ label = layer_name
+ ax.add_patch(descartes.PolygonPatch(polygon_proj, fc=self.color_map[layer_name], alpha=alpha,
+ label=label))
+
+ # Display the image.
+ plt.axis('off')
+ ax.invert_yaxis()
+
+ if out_path is not None:
+ plt.tight_layout()
+ plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
+
+ return fig, ax
+
+ @staticmethod
+ def points_transform(points, poserecord, cs_record, cam_intrinsic, im_size, near_plane=1e-8,
+ render_behind_cam=True, render_outside_im=True):
+ points = np.vstack((points, np.zeros((1, points.shape[1]))))
+
+ # Transform into the ego vehicle frame for the timestamp of the image.
+ points = points - np.array(poserecord['translation']).reshape((-1, 1))
+ points = np.dot(Quaternion(poserecord['rotation']).rotation_matrix.T, points)
+
+ # Transform into the camera.
+ points = points - np.array(cs_record['translation']).reshape((-1, 1))
+ points = np.dot(Quaternion(cs_record['rotation']).rotation_matrix.T, points)
+
+ # Remove points that are partially behind the camera.
+ depths = points[2, :]
+ behind = depths < near_plane
+ if np.all(behind):
+ return None
+
+ if render_behind_cam:
+ # Perform clipping on polygons that are partially behind the camera.
+ points = NuScenesMapExplorer._clip_points_behind_camera(points, near_plane)
+
+ elif np.any(behind):
+ # Otherwise ignore any polygon that is partially behind the camera.
+ return None
+
+ # Take the actual picture (matrix multiplication with camera-matrix + renormalization).
+ points = view_points(points, cam_intrinsic, normalize=True)
+
+ # Skip polygons where all points are outside the image.
+ # Leave a margin of 1 pixel for aesthetic reasons.
+ inside = np.ones(points.shape[1], dtype=bool)
+ inside = np.logical_and(inside, points[0, :] > 1)
+ inside = np.logical_and(inside, points[0, :] < im_size[0] - 1)
+ inside = np.logical_and(inside, points[1, :] > 1)
+ inside = np.logical_and(inside, points[1, :] < im_size[1] - 1)
+
+ if render_outside_im:
+ if np.all(np.logical_not(inside)):
+ return None
+ else:
+ if np.any(np.logical_not(inside)):
+ return None
+
+ # points = points[:, inside]
+
+ # Ignore polygons with less than 3 points after clipping.
+ if len(points) == 0 or points.shape[1] < 3:
+ return None
+
+ points = points[:2, :]
+ points = [(p0, p1) for (p0, p1) in zip(points[0], points[1])]
+ return points
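+ # Hypothetical usage sketch: project map-frame points into the image plane.
+ #   pts = np.array(polygon.exterior.xy)   # 2 x N array of global (x, y) coordinates
+ #   pixels = self.points_transform(pts, poserecord, cs_record, cam_intrinsic, im.size)
+ #   # Returns a list of (u, v) pixel coordinates, or None when the geometry cannot be used
+ #   # (e.g. entirely behind the camera or outside the image, depending on the render_* flags).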
+
+ def get_map_mask_in_image(self,
+ nusc: NuScenes,
+ sample_token: str,
+ camera_channel: str = 'CAM_FRONT',
+ alpha: float = 0.3,
+ patch_radius: float = 10000,
+ min_polygon_area: float = 1000,
+ render_behind_cam: bool = True,
+ render_outside_im: bool = True,
+ layer_names: List[str] = None,
+ verbose: bool = False,
+ out_path: str = None) -> np.ndarray:
+ """
+ Render a nuScenes camera image and overlay the polygons for the specified map layers.
+ Note that the projections are not always accurate as the localization is in 2d.
+ :param nusc: The NuScenes instance to load the image from.
+ :param sample_token: The image's corresponding sample_token.
+ :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'.
+ :param alpha: The transparency value of the layers to render in [0, 1].
+ :param patch_radius: The radius in meters around the ego car in which to select map records.
+ :param min_polygon_area: Minimum area a polygon needs to have to be rendered.
+ :param render_behind_cam: Whether to render polygons where any point is behind the camera.
+ :param render_outside_im: Whether to render polygons where any point is outside the image.
+ :param layer_names: The names of the layers to render, e.g. ['lane'].
+ If set to None, the recommended setting will be used.
+ :param verbose: Whether to print to stdout.
+ :param out_path: Optional path to save the rendered figure to disk.
+ """
+ near_plane = 1e-8
+ if verbose:
+ print('Warning: Note that the projections are not always accurate as the localization is in 2d.')
+
+ # Default layers.
+ if layer_names is None:
+ layer_names = ['road_segment', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area']
+
+ # # Check layers whether we can render them.
+ # for layer_name in layer_names:
+ # assert layer_name in self.map_api.non_geometric_polygon_layers, \
+ # 'Error: Can only render non-geometry polygons: %s' % layer_names
+
+ # Check that NuScenesMap was loaded for the correct location.
+ sample_record = nusc.get('sample', sample_token)
+ scene_record = nusc.get('scene', sample_record['scene_token'])
+ log_record = nusc.get('log', scene_record['log_token'])
+ log_location = log_record['location']
+ assert self.map_api.map_name == log_location, \
+ 'Error: NuScenesMap loaded for location %s, should be %s!' % (self.map_api.map_name, log_location)
+
+ # Grab the front camera image and intrinsics.
+ cam_token = sample_record['data'][camera_channel]
+ cam_record = nusc.get('sample_data', cam_token)
+ cam_path = nusc.get_sample_data_path(cam_token)
+ im = Image.open(cam_path)
+ im_size = im.size
+ cs_record = nusc.get('calibrated_sensor', cam_record['calibrated_sensor_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+
+ # Retrieve the current map.
+ poserecord = nusc.get('ego_pose', cam_record['ego_pose_token'])
+ ego_pose = poserecord['translation']
+ box_coords = (
+ ego_pose[0] - patch_radius,
+ ego_pose[1] - patch_radius,
+ ego_pose[0] + patch_radius,
+ ego_pose[1] + patch_radius,
+ )
+ records_in_patch = self.get_records_in_patch(box_coords, layer_names, 'intersect')
+
+ if out_path is not None:
+ # Init axes.
+ fig = plt.figure(figsize=(9, 16))
+ ax = fig.add_axes([0, 0, 1, 1])
+ ax.set_xlim(0, im_size[0])
+ ax.set_ylim(0, im_size[1])
+ ax.imshow(im)
+
+ points_transform = partial(self.points_transform, poserecord=poserecord, cs_record=cs_record,
+ cam_intrinsic=cam_intrinsic, near_plane=near_plane, im_size=im_size,
+ render_behind_cam=render_behind_cam, render_outside_im=render_outside_im)
+
+ # Retrieve and render each record.
+ map_geom = []
+ for layer_name in layer_names:
+ if layer_name in self.map_api.non_geometric_line_layers:
+ line_list = []
+ for token in records_in_patch[layer_name]:
+ record = self.map_api.get(layer_name, token)
+ line = self.map_api.extract_line(record['line_token'])
+ if line.is_empty: # Skip lines without nodes.
+ continue
+ points = np.array(line.xy)
+ points = points_transform(points)
+ if points is None:
+ continue
+ line = LineString(points)
+ line_list.append(line)
+ # For visualization only
+ if out_path is not None:
+ polygon = Polygon(points)
+ ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name],
+ alpha=alpha, label=layer_name))
+ map_geom.append((layer_name, line_list))
+ elif layer_name == 'drivable_area':
+ polygon_list = []
+ for token in records_in_patch[layer_name]:
+ record = self.map_api.get(layer_name, token)
+ polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in
+ record['polygon_tokens']]
+ for polygon in polygons:
+ ex_points = np.array(polygon.exterior.xy)
+ ex_points = points_transform(ex_points)
+ if ex_points is None:
+ continue
+ interiors = []
+ for interior in polygon.interiors:
+ in_points = np.array(interior.xy)
+ in_points = points_transform(in_points)
+ if in_points is None:
+ continue
+ interiors.append(in_points)
+ polygon = Polygon(ex_points, interiors)
+ polygon = polygon.buffer(0.01)
+ if polygon.geom_type == 'Polygon':
+ polygon = MultiPolygon([polygon])
+ # Filter small polygons
+ if polygon.area < min_polygon_area:
+ continue
+ polygon_list.append(polygon)
+ # For visualization only
+ if out_path is not None:
+ ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name],
+ alpha=alpha, label=layer_name))
+ map_geom.append((layer_name, polygon_list))
+ else:
+ polygon_list = []
+ for token in records_in_patch[layer_name]:
+ record = self.map_api.get(layer_name, token)
+ polygon = self.map_api.extract_polygon(record['polygon_token'])
+ if polygon.is_valid:
+ if not polygon.is_empty:
+ ex_points = np.array(polygon.exterior.xy)
+ ex_points = points_transform(ex_points)
+ if ex_points is None:
+ continue
+ interiors = []
+ for interior in polygon.interiors:
+ in_points = np.array(interior.xy)
+ in_points = points_transform(in_points)
+ if in_points is None:
+ continue
+ interiors.append(in_points)
+ polygon = Polygon(ex_points, interiors)
+ polygon = polygon.buffer(0.01)
+ if polygon.geom_type == 'Polygon':
+ polygon = MultiPolygon([polygon])
+ # Filter small polygons
+ if polygon.area < min_polygon_area:
+ continue
+ polygon_list.append(polygon)
+ # For visualization only
+ if out_path is not None:
+ ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name],
+ alpha=alpha, label=layer_name))
+ map_geom.append((layer_name, polygon_list))
+
+ # For visualization only
+ if out_path is not None:
+ # Display the image.
+ plt.axis('off')
+ ax.invert_yaxis()
+ plt.tight_layout()
+ plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
+ plt.close()
+
+ # Convert geometry of each layer into mask and stack them into a numpy tensor.
+ # Convert the patch box from global coordinates to local coordinates by setting the center to (0, 0).
+ local_box = (im_size[0] // 2, im_size[1] // 2, im_size[1], im_size[0])
+ canvas_size = (im_size[1], im_size[0])
+ img_mask = self.map_geom_to_mask(map_geom, local_box, canvas_size)
+ assert np.all(img_mask.shape[1:] == canvas_size)
+ return img_mask
+
+ def render_egoposes_on_fancy_map(self,
+ nusc: NuScenes,
+ scene_tokens: List = None,
+ verbose: bool = True,
+ out_path: str = None,
+ render_egoposes: bool = True,
+ render_egoposes_range: bool = True,
+ render_legend: bool = True,
+ bitmap: Optional[BitMap] = None) -> Tuple[np.ndarray, Figure, Axes]:
+ """
+ Renders each ego pose of a list of scenes on the map (around 40 poses per scene).
+ This method is heavily inspired by NuScenes.render_egoposes_on_map(), but uses the map expansion pack maps.
+ Note that the maps are constantly evolving, whereas we only released a single snapshot of the data.
+ Therefore for some scenes there is a bad fit between ego poses and maps.
+ :param nusc: The NuScenes instance to load the ego poses from.
+ :param scene_tokens: Optional list of scene tokens corresponding to the current map location.
+ :param verbose: Whether to show status messages and progress bar.
+ :param out_path: Optional path to save the rendered figure to disk.
+ :param render_egoposes: Whether to render ego poses.
+ :param render_egoposes_range: Whether to render a rectangle around all ego poses.
+ :param render_legend: Whether to render the legend of map layers.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ :return: An n x 2 matrix with the n ego poses in global map coordinates, plus the matplotlib figure and axes.
+ """
+ # Settings
+ patch_margin = 2
+ min_diff_patch = 30
+
+ # Ids of scenes with a bad match between localization and map.
+ scene_blacklist = [499, 515, 517]
+
+ # Get logs by location.
+ log_location = self.map_api.map_name
+ log_tokens = [log['token'] for log in nusc.log if log['location'] == log_location]
+ assert len(log_tokens) > 0, 'Error: This split has 0 scenes for location %s!' % log_location
+
+ # Filter scenes.
+ scene_tokens_location = [e['token'] for e in nusc.scene if e['log_token'] in log_tokens]
+ if scene_tokens is not None:
+ scene_tokens_location = [t for t in scene_tokens_location if t in scene_tokens]
+ assert len(scene_tokens_location) > 0, 'Error: Found 0 valid scenes for location %s!' % log_location
+
+ map_poses = []
+ if verbose:
+ print('Adding ego poses to map...')
+ for scene_token in tqdm(scene_tokens_location, disable=not verbose):
+ # Check that the scene is from the correct location.
+ scene_record = nusc.get('scene', scene_token)
+ scene_name = scene_record['name']
+ scene_id = int(scene_name.replace('scene-', ''))
+ log_record = nusc.get('log', scene_record['log_token'])
+ assert log_record['location'] == log_location, \
+ 'Error: The provided scene_tokens do not correspond to the provided map location!'
+
+ # Print a warning if the localization is known to be bad.
+ if verbose and scene_id in scene_blacklist:
+ print('Warning: %s is known to have a bad fit between ego pose and map.' % scene_name)
+
+ # For each sample in the scene, store the ego pose.
+ sample_tokens = nusc.field2token('sample', 'scene_token', scene_token)
+ for sample_token in sample_tokens:
+ sample_record = nusc.get('sample', sample_token)
+
+ # Poses are associated with the sample_data. Here we use the lidar sample_data.
+ sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP'])
+ pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token'])
+
+ # Calculate the pose on the map and append.
+ map_poses.append(pose_record['translation'])
+
+ # Check that ego poses aren't empty.
+ assert len(map_poses) > 0, 'Error: Found 0 ego poses. Please check the inputs.'
+
+ # Compute number of close ego poses.
+ if verbose:
+ print('Creating plot...')
+ map_poses = np.vstack(map_poses)[:, :2]
+
+ # Render the map patch with the current ego poses.
+ min_patch = np.floor(map_poses.min(axis=0) - patch_margin)
+ max_patch = np.ceil(map_poses.max(axis=0) + patch_margin)
+ diff_patch = max_patch - min_patch
+ if any(diff_patch < min_diff_patch):
+ center_patch = (min_patch + max_patch) / 2
+ diff_patch = np.maximum(diff_patch, min_diff_patch)
+ min_patch = center_patch - diff_patch / 2
+ max_patch = center_patch + diff_patch / 2
+ my_patch = (min_patch[0], min_patch[1], max_patch[0], max_patch[1])
+ fig, ax = self.render_map_patch(my_patch, self.map_api.non_geometric_layers, figsize=(10, 10),
+ render_egoposes_range=render_egoposes_range,
+ render_legend=render_legend, bitmap=bitmap)
+
+ # Plot in the same axis as the map.
+ # Make sure these are plotted "on top".
+ if render_egoposes:
+ ax.scatter(map_poses[:, 0], map_poses[:, 1], s=20, c='k', alpha=1.0, zorder=2)
+ plt.axis('off')
+
+ if out_path is not None:
+ plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
+
+ return map_poses, fig, ax
+
+ def render_next_roads(self,
+ x: float,
+ y: float,
+ alpha: float = 0.5,
+ figsize: Union[None, float, Tuple[float, float]] = None,
+ bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]:
+ """
+ Renders the possible next roads from a point of interest.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :param alpha: The opacity of each layer that gets rendered.
+ :param figsize: Size of the whole figure.
+ :param bitmap: Optional BitMap object to render below the other map layers.
+ """
+ # Get next roads.
+ next_roads = self.map_api.get_next_roads(x, y)
+ layer_names = []
+ tokens = []
+ for layer_name, layer_tokens in next_roads.items():
+ if len(layer_tokens) > 0:
+ layer_names.append(layer_name)
+ tokens.extend(layer_tokens)
+
+ # Render them.
+ fig, ax = self.render_layers(layer_names, alpha, figsize, tokens=tokens, bitmap=bitmap)
+
+ # Render current location with an x.
+ ax.plot(x, y, 'x', markersize=12, color='red')
+
+ return fig, ax
+
+ @staticmethod
+ def _clip_points_behind_camera(points, near_plane: float):
+ """
+ Perform clipping on polygons that are partially behind the camera.
+ This method is necessary as the projection does not work for points behind the camera.
+ Hence we compute the line between the point and the camera and follow that line until we hit the near plane of
+ the camera. Then we use that point.
+ :param points: Matrix of points, where each point (x, y, z) is along each column.
+ :param near_plane: If we set the near_plane distance of the camera to 0 then some points will project to
+ infinity. Therefore we need to clip these points at the near plane.
+ :return: The clipped version of the polygon. This may have fewer points than the original polygon if some lines
+ were entirely behind the camera.
+ """
+ points_clipped = []
+ # Loop through each line on the polygon.
+ # For each line where exactly 1 endpoint is behind the camera, move the point along the line until
+ # it hits the near plane of the camera (clipping).
+ assert points.shape[0] == 3
+ point_count = points.shape[1]
+ for line_1 in range(point_count):
+ line_2 = (line_1 + 1) % point_count
+ point_1 = points[:, line_1]
+ point_2 = points[:, line_2]
+ z_1 = point_1[2]
+ z_2 = point_2[2]
+
+ if z_1 >= near_plane and z_2 >= near_plane:
+ # Both points are in front.
+ # Add both points unless the first is already added.
+ if len(points_clipped) == 0 or all(points_clipped[-1] != point_1):
+ points_clipped.append(point_1)
+ points_clipped.append(point_2)
+ elif z_1 < near_plane and z_2 < near_plane:
+ # Both points are behind.
+ # Don't add anything.
+ continue
+ else:
+ # One point is in front, one behind.
+ # By convention point_a is behind the camera and point_b in front.
+ if z_1 <= z_2:
+ point_a = points[:, line_1]
+ point_b = points[:, line_2]
+ else:
+ point_a = points[:, line_2]
+ point_b = points[:, line_1]
+ z_a = point_a[2]
+ z_b = point_b[2]
+
+ # Clip line along near plane.
+ pointdiff = point_b - point_a
+ alpha = (near_plane - z_b) / (z_a - z_b)
+ clipped = point_a + (1 - alpha) * pointdiff
+ assert np.abs(clipped[2] - near_plane) < 1e-6
+
+ # Add the first point (if valid and not duplicate), the clipped point and the second point (if valid).
+ if z_1 >= near_plane and (len(points_clipped) == 0 or all(points_clipped[-1] != point_1)):
+ points_clipped.append(point_1)
+ points_clipped.append(clipped)
+ if z_2 >= near_plane:
+ points_clipped.append(point_2)
+
+ points_clipped = np.array(points_clipped).transpose()
+ return points_clipped
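+ # Worked example of the near-plane interpolation above (illustrative numbers):
+ #   with near_plane = 1.0, point_a = (0, 0, -1) behind the camera and point_b = (4, 0, 3) in front,
+ #   alpha = (1 - 3) / (-1 - 3) = 0.5 and
+ #   clipped = point_a + (1 - 0.5) * (point_b - point_a) = (2, 0, 1),
+ #   i.e. the intersection of the segment with the z = near_plane plane.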
+
+ def get_records_in_patch(self,
+ box_coords: Tuple[float, float, float, float],
+ layer_names: List[str] = None,
+ mode: str = 'intersect') -> Dict[str, List[str]]:
+ """
+ Get all the record tokens that intersect or are within a particular rectangular patch.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param layer_names: Names of the layers that we want to retrieve in a particular patch.
+ By default will always look at all non-geometric layers.
+ :param mode: "intersect" will return all non-geometric records that intersect the patch,
+ "within" will return all non-geometric records that are within the patch.
+ :return: Dictionary of layer_name - tokens pairs.
+ """
+ if mode not in ['intersect', 'within']:
+ raise ValueError("Mode {} is not valid, choice=('intersect', 'within')".format(mode))
+
+ if layer_names is None:
+ layer_names = self.map_api.non_geometric_layers
+
+ records_in_patch = dict()
+ for layer_name in layer_names:
+ layer_records = []
+ for record in getattr(self.map_api, layer_name):
+ token = record['token']
+ if self.is_record_in_patch(layer_name, token, box_coords, mode):
+ layer_records.append(token)
+
+ records_in_patch.update({layer_name: layer_records})
+
+ return records_in_patch
+
+ def is_record_in_patch(self,
+ layer_name: str,
+ token: str,
+ box_coords: Tuple[float, float, float, float],
+ mode: str = 'intersect') -> bool:
+ """
+ Query whether a particular record is in a rectangular patch.
+ :param layer_name: The layer name of the record.
+ :param token: The record token.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param mode: "intersect" means it will return True if the geometric object intersects the patch and False
+ otherwise, "within" will return True if the geometric object is within the patch and False otherwise.
+ :return: Boolean value on whether a particular record intersects or is within a particular patch.
+ """
+ if mode not in ['intersect', 'within']:
+ raise ValueError("Mode {} is not valid, choice=('intersect', 'within')".format(mode))
+
+ if layer_name in self.map_api.lookup_polygon_layers:
+ return self._is_polygon_record_in_patch(token, layer_name, box_coords, mode)
+ elif layer_name in self.map_api.non_geometric_line_layers:
+ return self._is_line_record_in_patch(token, layer_name, box_coords, mode)
+ else:
+ raise ValueError("{} is not a valid layer".format(layer_name))
+
+ def layers_on_point(self, x: float, y: float, layer_names: List[str] = None) -> Dict[str, str]:
+ """
+ Returns all the polygonal layers that a particular point is on.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :param layer_names: The names of the layers to search for.
+ :return: All the polygonal layers that a particular point is on.
+ """
+ # Default option.
+ if layer_names is None:
+ layer_names = self.map_api.non_geometric_polygon_layers
+
+ layers_on_point = dict()
+ for layer_name in layer_names:
+ layers_on_point.update({layer_name: self.record_on_point(x, y, layer_name)})
+
+ return layers_on_point
+
+ def record_on_point(self, x: float, y: float, layer_name: str) -> str:
+ """
+ Query what record of a layer a particular point is on.
+ :param x: x coordinate of the point of interest.
+ :param y: y coordinate of the point of interest.
+ :param layer_name: The non geometric polygonal layer name that we are interested in.
+ :return: The first token of a layer a particular point is on or '' if no layer is found.
+ """
+ if layer_name not in self.map_api.non_geometric_polygon_layers:
+ raise ValueError("{} is not a polygon layer".format(layer_name))
+
+ point = Point(x, y)
+ records = getattr(self.map_api, layer_name)
+
+ if layer_name == 'drivable_area':
+ for record in records:
+ polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']]
+ for polygon in polygons:
+ if point.within(polygon):
+ return record['token']
+ else:
+ pass
+ else:
+ for record in records:
+ polygon = self.map_api.extract_polygon(record['polygon_token'])
+ if point.within(polygon):
+ return record['token']
+ else:
+ pass
+
+ # If nothing is found, return an empty string.
+ return ''
+
+ def extract_polygon(self, polygon_token: str) -> Polygon:
+ """
+ Construct a shapely Polygon object out of a polygon token.
+ :param polygon_token: The token of the polygon record.
+ :return: The polygon wrapped in a shapely Polygon object.
+ """
+ polygon_record = self.map_api.get('polygon', polygon_token)
+
+ exterior_coords = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y'])
+ for token in polygon_record['exterior_node_tokens']]
+
+ interiors = []
+ for hole in polygon_record['holes']:
+ interior_coords = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y'])
+ for token in hole['node_tokens']]
+ if len(interior_coords) > 0: # Add only non-empty holes.
+ interiors.append(interior_coords)
+
+ return Polygon(exterior_coords, interiors)
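+ # Hypothetical usage sketch: rebuild a polygon (with holes) from its record and inspect it via shapely.
+ #   polygon = explorer.extract_polygon('<polygon_token>')
+ #   polygon.area, list(polygon.exterior.coords), [list(i.coords) for i in polygon.interiors]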
+
+ def extract_line(self, line_token: str) -> LineString:
+ """
+ Construct a shapely LineString object out of a line token.
+ :param line_token: The token of the line record.
+ :return: The line wrapped in a LineString object.
+ """
+ line_record = self.map_api.get('line', line_token)
+ line_nodes = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y'])
+ for token in line_record['node_tokens']]
+
+ return LineString(line_nodes)
+
+ def get_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]:
+ """
+ Get the bounds of the geometric object that corresponds to a non geometric record.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record.
+ :return: min_x, min_y, max_x, max_y of the line representation.
+ """
+ if layer_name in self.map_api.non_geometric_polygon_layers:
+ return self._get_polygon_bounds(layer_name, token)
+ elif layer_name in self.map_api.non_geometric_line_layers:
+ return self._get_line_bounds(layer_name, token)
+ else:
+ raise ValueError("{} is not a valid layer".format(layer_name))
+
+ def _get_polygon_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]:
+ """
+ Get the extremities of the polygon object that corresponds to a non geometric record.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record.
+ :return: min_x, min_y, max_x, max_y of the polygon or polygons (for drivable_area) representation.
+ """
+ if layer_name not in self.map_api.non_geometric_polygon_layers:
+ raise ValueError("{} is not a record with polygon representation".format(token))
+
+ record = self.map_api.get(layer_name, token)
+
+ if layer_name == 'drivable_area':
+ polygons = [self.map_api.get('polygon', polygon_token) for polygon_token in record['polygon_tokens']]
+ exterior_node_coords = []
+
+ for polygon in polygons:
+ nodes = [self.map_api.get('node', node_token) for node_token in polygon['exterior_node_tokens']]
+ node_coords = [(node['x'], node['y']) for node in nodes]
+ exterior_node_coords.extend(node_coords)
+
+ exterior_node_coords = np.array(exterior_node_coords)
+ else:
+ exterior_nodes = [self.map_api.get('node', token) for token in record['exterior_node_tokens']]
+ exterior_node_coords = np.array([(node['x'], node['y']) for node in exterior_nodes])
+
+ xs = exterior_node_coords[:, 0]
+ ys = exterior_node_coords[:, 1]
+
+ x2 = xs.max()
+ x1 = xs.min()
+ y2 = ys.max()
+ y1 = ys.min()
+
+ return x1, y1, x2, y2
+
+ def _get_line_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]:
+ """
+ Get the bounds of the line object that corresponds to a non geometric record.
+ :param layer_name: Name of the layer that we are interested in.
+ :param token: Token of the record.
+ :return: min_x, min_y, max_x, max_y of the line representation.
+ """
+ if layer_name not in self.map_api.non_geometric_line_layers:
+ raise ValueError("{} is not a record with line representation".format(token))
+
+ record = self.map_api.get(layer_name, token)
+ nodes = [self.map_api.get('node', node_token) for node_token in record['node_tokens']]
+ node_coords = [(node['x'], node['y']) for node in nodes]
+ node_coords = np.array(node_coords)
+
+ xs = node_coords[:, 0]
+ ys = node_coords[:, 1]
+
+ x2 = xs.max()
+ x1 = xs.min()
+ y2 = ys.max()
+ y1 = ys.min()
+
+ return x1, y1, x2, y2
+
+ def _is_polygon_record_in_patch(self,
+ token: str,
+ layer_name: str,
+ box_coords: Tuple[float, float, float, float],
+ mode: str = 'intersect') -> bool:
+ """
+ Query whether a particular polygon record is in a rectangular patch.
+ :param layer_name: The layer name of the record.
+ :param token: The record token.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param mode: "intersect" means it will return True if the geometric object intersects the patch and False
+ otherwise, "within" will return True if the geometric object is within the patch and False otherwise.
+ :return: Boolean value on whether a particular polygon record intersects or is within a particular patch.
+ """
+ if layer_name not in self.map_api.lookup_polygon_layers:
+ raise ValueError('{} is not a polygonal layer'.format(layer_name))
+
+ x_min, y_min, x_max, y_max = box_coords
+ record = self.map_api.get(layer_name, token)
+ rectangular_patch = box(x_min, y_min, x_max, y_max)
+
+ if layer_name == 'drivable_area':
+ polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']]
+ geom = MultiPolygon(polygons)
+ else:
+ geom = self.map_api.extract_polygon(record['polygon_token'])
+
+ if mode == 'intersect':
+ return geom.intersects(rectangular_patch)
+ elif mode == 'within':
+ return geom.within(rectangular_patch)
+
+ def _is_line_record_in_patch(self,
+ token: str,
+ layer_name: str,
+ box_coords: Tuple[float, float, float, float],
+ mode: str = 'intersect') -> bool:
+ """
+ Query whether a particular line record is in a rectangular patch.
+ :param layer_name: The layer name of the record.
+ :param token: The record token.
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max).
+ :param mode: "intersect" means it will return True if the geometric object intersects the patch and False
+ otherwise, "within" will return True if the geometric object is within the patch and False otherwise.
+ :return: Boolean value on whether a particular line record intersects or is within a particular patch.
+ """
+ if layer_name not in self.map_api.non_geometric_line_layers:
+ raise ValueError("{} is not a line layer".format(layer_name))
+
+ # Retrieve nodes of this line.
+ record = self.map_api.get(layer_name, token)
+ node_recs = [self.map_api.get('node', node_token) for node_token in record['node_tokens']]
+ node_coords = [[node['x'], node['y']] for node in node_recs]
+ node_coords = np.array(node_coords)
+
+ # A few lines in Queenstown have zero nodes. In this case we return False.
+ if len(node_coords) == 0:
+ return False
+
+        # Check that nodes fall inside the patch.
+ x_min, y_min, x_max, y_max = box_coords
+ cond_x = np.logical_and(node_coords[:, 0] < x_max, node_coords[:, 0] > x_min)
+ cond_y = np.logical_and(node_coords[:, 1] < y_max, node_coords[:, 1] > y_min)
+ cond = np.logical_and(cond_x, cond_y)
+ if mode == 'intersect':
+ return np.any(cond)
+ elif mode == 'within':
+ return np.all(cond)
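+
+    # Illustrative sketch (not part of the upstream devkit code): the node-based test above
+    # reduces to per-axis coordinate comparisons. For a toy patch (x_min, y_min, x_max, y_max)
+    # = (0, 0, 10, 10) and three nodes, 'intersect' and 'within' behave as follows:
+    #
+    #   node_coords = np.array([[1.0, 2.0], [5.0, 5.0], [12.0, 3.0]])
+    #   cond_x = np.logical_and(node_coords[:, 0] < 10, node_coords[:, 0] > 0)
+    #   cond_y = np.logical_and(node_coords[:, 1] < 10, node_coords[:, 1] > 0)
+    #   cond = np.logical_and(cond_x, cond_y)
+    #   np.any(cond)   # True  -> 'intersect': at least one node is inside the patch
+    #   np.all(cond)   # False -> 'within': the node at x = 12 falls outside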
+
+ def _render_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None:
+ """
+ Wrapper method that renders individual layers on an axis.
+ :param ax: The matplotlib axes where the layer will get rendered.
+ :param layer_name: Name of the layer that we are interested in.
+ :param alpha: The opacity of the layer to be rendered.
+ :param tokens: Optional list of tokens to render. None means all tokens are rendered.
+ """
+ if layer_name in self.map_api.non_geometric_polygon_layers:
+ self._render_polygon_layer(ax, layer_name, alpha, tokens)
+ elif layer_name in self.map_api.non_geometric_line_layers:
+ self._render_line_layer(ax, layer_name, alpha, tokens)
+ else:
+ raise ValueError("{} is not a valid layer".format(layer_name))
+
+ def _render_polygon_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None:
+ """
+ Renders an individual non-geometric polygon layer on an axis.
+ :param ax: The matplotlib axes where the layer will get rendered.
+ :param layer_name: Name of the layer that we are interested in.
+ :param alpha: The opacity of the layer to be rendered.
+ :param tokens: Optional list of tokens to render. None means all tokens are rendered.
+ """
+ if layer_name not in self.map_api.non_geometric_polygon_layers:
+ raise ValueError('{} is not a polygonal layer'.format(layer_name))
+
+ first_time = True
+ records = getattr(self.map_api, layer_name)
+ if tokens is not None:
+ records = [r for r in records if r['token'] in tokens]
+ if layer_name == 'drivable_area':
+ for record in records:
+ polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']]
+
+ for polygon in polygons:
+ if first_time:
+ label = layer_name
+ first_time = False
+ else:
+ label = None
+ ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha,
+ label=label))
+ else:
+ for record in records:
+ polygon = self.map_api.extract_polygon(record['polygon_token'])
+
+ if first_time:
+ label = layer_name
+ first_time = False
+ else:
+ label = None
+
+ ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha,
+ label=label))
+
+ def _render_line_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None:
+ """
+ Renders an individual non-geometric line layer on an axis.
+ :param ax: The matplotlib axes where the layer will get rendered.
+ :param layer_name: Name of the layer that we are interested in.
+ :param alpha: The opacity of the layer to be rendered.
+ :param tokens: Optional list of tokens to render. None means all tokens are rendered.
+ """
+ if layer_name not in self.map_api.non_geometric_line_layers:
+ raise ValueError("{} is not a line layer".format(layer_name))
+
+ first_time = True
+ records = getattr(self.map_api, layer_name)
+ if tokens is not None:
+ records = [r for r in records if r['token'] in tokens]
+ for record in records:
+ if first_time:
+ label = layer_name
+ first_time = False
+ else:
+ label = None
+ line = self.map_api.extract_line(record['line_token'])
+ if line.is_empty: # Skip lines without nodes
+ continue
+ xs, ys = line.xy
+
+ if layer_name == 'traffic_light':
+ # Draws an arrow with the physical traffic light as the starting point, pointing to the direction on
+ # where the traffic light points.
+ ax.add_patch(Arrow(xs[0], ys[0], xs[1]-xs[0], ys[1]-ys[0], color=self.color_map[layer_name],
+ label=label))
+ else:
+ ax.plot(xs, ys, color=self.color_map[layer_name], alpha=alpha, label=label)
+
+ def _get_layer_geom(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_name: str) -> List[Geometry]:
+ """
+ Wrapper method that gets the geometries for each layer.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ :param layer_name: Name of map layer to be converted to binary map mask patch.
+ :return: List of geometries for the given layer.
+ """
+ if layer_name in self.map_api.non_geometric_polygon_layers:
+ return self._get_layer_polygon(patch_box, patch_angle, layer_name)
+ elif layer_name in self.map_api.non_geometric_line_layers:
+ return self._get_layer_line(patch_box, patch_angle, layer_name)
+ else:
+ raise ValueError("{} is not a valid layer".format(layer_name))
+
+ def _layer_geom_to_mask(self,
+ layer_name: str,
+ layer_geom: List[Geometry],
+ local_box: Tuple[float, float, float, float],
+ canvas_size: Tuple[int, int]) -> np.ndarray:
+ """
+ Wrapper method that gets the mask for each layer's geometries.
+ :param layer_name: The name of the layer for which we get the masks.
+ :param layer_geom: List of the geometries of the layer specified in layer_name.
+ :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically
+ x_center = y_center = 0.
+ :param canvas_size: Size of the output mask (h, w).
+ """
+ if layer_name in self.map_api.non_geometric_polygon_layers:
+ return self._polygon_geom_to_mask(layer_geom, local_box, layer_name, canvas_size)
+ elif layer_name in self.map_api.non_geometric_line_layers:
+ return self._line_geom_to_mask(layer_geom, local_box, layer_name, canvas_size)
+ else:
+ raise ValueError("{} is not a valid layer".format(layer_name))
+
+ @staticmethod
+ def mask_for_polygons(polygons: MultiPolygon, mask: np.ndarray) -> np.ndarray:
+ """
+ Convert a polygon or multipolygon list to an image mask ndarray.
+ :param polygons: List of Shapely polygons to be converted to numpy array.
+ :param mask: Canvas where mask will be generated.
+ :return: Numpy ndarray polygon mask.
+ """
+ if not polygons:
+ return mask
+
+ def int_coords(x):
+ # function to round and convert to int
+ return np.array(x).round().astype(np.int32)
+ exteriors = [int_coords(poly.exterior.coords) for poly in polygons]
+ interiors = [int_coords(pi.coords) for poly in polygons for pi in poly.interiors]
+ cv2.fillPoly(mask, exteriors, 1)
+ cv2.fillPoly(mask, interiors, 0)
+ return mask
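+
+    # Usage sketch (illustrative only; the explorer class name is assumed to match the
+    # upstream devkit, i.e. NuScenesMapExplorer): rasterize a small square onto an 8x8 canvas.
+    #
+    #   import numpy as np
+    #   from shapely.geometry import box, MultiPolygon
+    #   canvas = np.zeros((8, 8), np.uint8)
+    #   square = MultiPolygon([box(2, 2, 6, 6)])
+    #   canvas = NuScenesMapExplorer.mask_for_polygons(square, canvas)
+    #   canvas.sum()  # number of pixels filled by cv2.fillPoly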
+
+ @staticmethod
+ def mask_for_lines(lines: LineString, mask: np.ndarray) -> np.ndarray:
+ """
+        Convert a Shapely LineString or MultiLineString to an image mask ndarray.
+        :param lines: Shapely LineString or MultiLineString to be rasterized onto the mask.
+ :param mask: Canvas where mask will be generated.
+ :return: Numpy ndarray line mask.
+ """
+ if lines.geom_type == 'MultiLineString':
+ for line in lines:
+ coords = np.asarray(list(line.coords), np.int32)
+ coords = coords.reshape((-1, 2))
+ cv2.polylines(mask, [coords], False, 1, 2)
+ else:
+ coords = np.asarray(list(lines.coords), np.int32)
+ coords = coords.reshape((-1, 2))
+ cv2.polylines(mask, [coords], False, 1, 2)
+
+ return mask
+
+ def _polygon_geom_to_mask(self,
+ layer_geom: List[Polygon],
+ local_box: Tuple[float, float, float, float],
+ layer_name: str,
+ canvas_size: Tuple[int, int]) -> np.ndarray:
+ """
+ Convert polygon inside patch to binary mask and return the map patch.
+ :param layer_geom: list of polygons for each map layer
+ :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically
+ x_center = y_center = 0.
+ :param layer_name: name of map layer to be converted to binary map mask patch.
+ :param canvas_size: Size of the output mask (h, w).
+ :return: Binary map mask patch with the size canvas_size.
+ """
+ if layer_name not in self.map_api.non_geometric_polygon_layers:
+ raise ValueError('{} is not a polygonal layer'.format(layer_name))
+
+ patch_x, patch_y, patch_h, patch_w = local_box
+
+ patch = self.get_patch_coord(local_box)
+
+ canvas_h = canvas_size[0]
+ canvas_w = canvas_size[1]
+
+ scale_height = canvas_h / patch_h
+ scale_width = canvas_w / patch_w
+
+ trans_x = -patch_x + patch_w / 2.0
+ trans_y = -patch_y + patch_h / 2.0
+
+ map_mask = np.zeros(canvas_size, np.uint8)
+
+ for polygon in layer_geom:
+ new_polygon = polygon.intersection(patch)
+ if not new_polygon.is_empty:
+ new_polygon = affinity.affine_transform(new_polygon,
+ [1.0, 0.0, 0.0, 1.0, trans_x, trans_y])
+ new_polygon = affinity.scale(new_polygon, xfact=scale_width, yfact=scale_height, origin=(0, 0))
+
+ if new_polygon.geom_type == 'Polygon':
+ new_polygon = MultiPolygon([new_polygon])
+
+ # if new_polygon.area < 1000:
+ # continue
+
+                if not isinstance(new_polygon, MultiPolygon):
+                    # Skip degenerate results of the intersection (e.g. a GeometryCollection).
+                    print(new_polygon)
+                    continue
+
+ map_mask = self.mask_for_polygons(new_polygon, map_mask)
+
+ return map_mask
+
+ def _line_geom_to_mask(self,
+ layer_geom: List[LineString],
+ local_box: Tuple[float, float, float, float],
+ layer_name: str,
+ canvas_size: Tuple[int, int]) -> Optional[np.ndarray]:
+ """
+ Convert line inside patch to binary mask and return the map patch.
+ :param layer_geom: list of LineStrings for each map layer
+ :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically
+ x_center = y_center = 0.
+ :param layer_name: name of map layer to be converted to binary map mask patch.
+ :param canvas_size: Size of the output mask (h, w).
+ :return: Binary map mask patch in a canvas size.
+ """
+ if layer_name not in self.map_api.non_geometric_line_layers:
+ raise ValueError("{} is not a line layer".format(layer_name))
+
+ patch_x, patch_y, patch_h, patch_w = local_box
+
+ patch = self.get_patch_coord(local_box)
+
+ canvas_h = canvas_size[0]
+ canvas_w = canvas_size[1]
+ scale_height = canvas_h/patch_h
+ scale_width = canvas_w/patch_w
+
+ trans_x = -patch_x + patch_w / 2.0
+ trans_y = -patch_y + patch_h / 2.0
+
+ map_mask = np.zeros(canvas_size, np.uint8)
+
+ if layer_name == 'traffic_light':
+ return None
+
+ for line in layer_geom:
+ new_line = line.intersection(patch)
+ if not new_line.is_empty:
+ new_line = affinity.affine_transform(new_line,
+ [1.0, 0.0, 0.0, 1.0, trans_x, trans_y])
+ new_line = affinity.scale(new_line, xfact=scale_width, yfact=scale_height, origin=(0, 0))
+
+ map_mask = self.mask_for_lines(new_line, map_mask)
+ return map_mask
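+
+    # Worked example (illustrative numbers): for local_box = (0.0, 0.0, 100.0, 100.0) and
+    # canvas_size = (200, 200), scale_height = scale_width = 2.0 px/m and trans_x = trans_y = 50.0,
+    # so a point at patch coordinates (x, y) is drawn at pixel (2 * (x + 50), 2 * (y + 50))
+    # before cv2.polylines rasterizes the line with a 2 px thickness.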
+
+ def _get_layer_polygon(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_name: str) -> List[Polygon]:
+ """
+ Retrieve the polygons of a particular layer within the specified patch.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ :param layer_name: name of map layer to be extracted.
+ :return: List of Polygon in a patch box.
+ """
+ if layer_name not in self.map_api.non_geometric_polygon_layers:
+ raise ValueError('{} is not a polygonal layer'.format(layer_name))
+
+ patch_x = patch_box[0]
+ patch_y = patch_box[1]
+
+ patch = self.get_patch_coord(patch_box, patch_angle)
+
+ records = getattr(self.map_api, layer_name)
+
+ polygon_list = []
+ if layer_name == 'drivable_area':
+ for record in records:
+ polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']]
+
+ for polygon in polygons:
+ new_polygon = polygon.intersection(patch)
+ if not new_polygon.is_empty:
+ new_polygon = affinity.rotate(new_polygon, -patch_angle,
+ origin=(patch_x, patch_y), use_radians=False)
+ new_polygon = affinity.affine_transform(new_polygon,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ if new_polygon.geom_type == 'Polygon':
+ new_polygon = MultiPolygon([new_polygon])
+ polygon_list.append(new_polygon)
+
+ else:
+ for record in records:
+ polygon = self.map_api.extract_polygon(record['polygon_token'])
+
+ if polygon.is_valid:
+ new_polygon = polygon.intersection(patch)
+ if not new_polygon.is_empty:
+ new_polygon = affinity.rotate(new_polygon, -patch_angle,
+ origin=(patch_x, patch_y), use_radians=False)
+ new_polygon = affinity.affine_transform(new_polygon,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ if new_polygon.geom_type == 'Polygon':
+ new_polygon = MultiPolygon([new_polygon])
+ polygon_list.append(new_polygon)
+
+ return polygon_list
+
+ def _get_layer_line(self,
+ patch_box: Tuple[float, float, float, float],
+ patch_angle: float,
+ layer_name: str) -> Optional[List[LineString]]:
+ """
+ Retrieve the lines of a particular layer within the specified patch.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ :param layer_name: name of map layer to be converted to binary map mask patch.
+ :return: List of LineString in a patch box.
+ """
+ if layer_name not in self.map_api.non_geometric_line_layers:
+ raise ValueError("{} is not a line layer".format(layer_name))
+
+ if layer_name == 'traffic_light':
+ return None
+
+ patch_x = patch_box[0]
+ patch_y = patch_box[1]
+
+ patch = self.get_patch_coord(patch_box, patch_angle)
+
+ line_list = []
+ records = getattr(self.map_api, layer_name)
+ for record in records:
+ line = self.map_api.extract_line(record['line_token'])
+ if line.is_empty: # Skip lines without nodes.
+ continue
+
+ new_line = line.intersection(patch)
+ if not new_line.is_empty:
+ new_line = affinity.rotate(new_line, -patch_angle,
+ origin=(patch_x, patch_y), use_radians=False)
+ new_line = affinity.affine_transform(new_line,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ line_list.append(new_line)
+
+ return line_list
+
+ @staticmethod
+ def get_patch_coord(patch_box: Tuple[float, float, float, float],
+ patch_angle: float = 0.0) -> Polygon:
+ """
+ Convert patch_box to shapely Polygon coordinates.
+ :param patch_box: Patch box defined as [x_center, y_center, height, width].
+ :param patch_angle: Patch orientation in degrees.
+ :return: Box Polygon for patch_box.
+ """
+ patch_x, patch_y, patch_h, patch_w = patch_box
+
+ x_min = patch_x - patch_w / 2.0
+ y_min = patch_y - patch_h / 2.0
+ x_max = patch_x + patch_w / 2.0
+ y_max = patch_y + patch_h / 2.0
+
+ patch = box(x_min, y_min, x_max, y_max)
+ patch = affinity.rotate(patch, patch_angle, origin=(patch_x, patch_y), use_radians=False)
+
+ return patch
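+
+    # Usage sketch (illustrative values): a 100 m (height) x 50 m (width) patch centred at
+    # (300, 1700) and rotated by 30 degrees mirrors the box + rotate steps above.
+    #
+    #   from shapely.geometry import box
+    #   from shapely import affinity
+    #   patch = box(300 - 50 / 2.0, 1700 - 100 / 2.0, 300 + 50 / 2.0, 1700 + 100 / 2.0)
+    #   patch = affinity.rotate(patch, 30.0, origin=(300, 1700), use_radians=False)
+    #   patch.bounds  # axis-aligned bounds of the rotated patch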
+
+ def _get_figsize(self, figsize: Union[None, float, Tuple[float, float]]) -> Tuple[float, float]:
+ """
+ Utility function that scales the figure size by the map canvas size.
+ If figsize is:
+ - None => Return default scale.
+ - Scalar => Scale canvas size.
+ - Two-tuple => Use the specified figure size.
+ :param figsize: The input figure size.
+ :return: The output figure size.
+ """
+ # Divide canvas size by arbitrary scalar to get into cm range.
+ canvas_size = np.array(self.map_api.canvas_edge)[::-1] / 200
+
+ if figsize is None:
+ return tuple(canvas_size)
+ elif type(figsize) in [int, float]:
+ return tuple(canvas_size * figsize)
+ elif type(figsize) == tuple and len(figsize) == 2:
+ return figsize
+ else:
+ raise Exception('Error: Invalid figsize: %s' % figsize)
diff --git a/mmcv/datasets/eval_utils/metric_utils.py b/mmcv/datasets/eval_utils/metric_utils.py
new file mode 100644
index 0000000..1058703
--- /dev/null
+++ b/mmcv/datasets/eval_utils/metric_utils.py
@@ -0,0 +1,104 @@
+import torch
+import math
+import numpy as np
+from typing import List, Dict, Tuple, Callable, Union
+
+def min_ade(traj: torch.Tensor, traj_gt: torch.Tensor,
+ masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+    Computes average displacement error for the best trajectory in a set,
+ with respect to ground truth
+ :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2]
+ :param traj_gt: ground truth trajectory, shape
+ [batch_size, sequence_length, 2]
+ :param masks: masks for varying length ground truth, shape
+ [batch_size, sequence_length]
+ :return errs, inds: errors and indices for modes with min error, shape
+ [batch_size]
+ """
+ num_modes = traj.shape[1]
+ traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1)
+ masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1)
+ err = traj_gt_rpt - traj[:, :, :, 0:2]
+ err = torch.pow(err, exponent=2)
+ err = torch.sum(err, dim=3)
+ err = torch.pow(err, exponent=0.5)
+ err = torch.sum(err * (1 - masks_rpt), dim=2) / \
+ torch.clip(torch.sum((1 - masks_rpt), dim=2), min=1)
+ err, inds = torch.min(err, dim=1)
+
+ return err, inds
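+
+# Illustrative sketch (not part of the original file): minADE over three modes for a single
+# sample with a two-step horizon and no padding (a mask of zeros marks every step as valid).
+#
+#   traj = torch.zeros(1, 3, 2, 2)                    # [batch, modes, steps, xy]
+#   traj[0, 1] = torch.tensor([[1., 0.], [2., 0.]])   # mode 1 matches the ground truth exactly
+#   traj_gt = torch.tensor([[[1., 0.], [2., 0.]]])    # [batch, steps, xy]
+#   masks = torch.zeros(1, 2)
+#   err, inds = min_ade(traj, traj_gt, masks)         # err -> tensor([0.]), inds -> tensor([1])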
+
+
+def min_fde(traj: torch.Tensor, traj_gt: torch.Tensor,
+ masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+    Computes final displacement error for the best trajectory in a set,
+ with respect to ground truth
+ :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2]
+ :param traj_gt: ground truth trajectory, shape
+ [batch_size, sequence_length, 2]
+ :param masks: masks for varying length ground truth, shape
+ [batch_size, sequence_length]
+ :return errs, inds: errors and indices for modes with min error,
+ shape [batch_size]
+ """
+ num_modes = traj.shape[1]
+ traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1)
+ lengths = torch.sum(1 - masks, dim=1).long()
+ inds = lengths.unsqueeze(1).unsqueeze(
+ 2).unsqueeze(3).repeat(1, num_modes, 1, 2) - 1
+
+ traj_last = torch.gather(traj[..., :2], dim=2, index=inds).squeeze(2)
+ traj_gt_last = torch.gather(traj_gt_rpt, dim=2, index=inds).squeeze(2)
+
+ err = traj_gt_last - traj_last[..., 0:2]
+ err = torch.pow(err, exponent=2)
+ err = torch.sum(err, dim=2)
+ err = torch.pow(err, exponent=0.5)
+ err, inds = torch.min(err, dim=1)
+
+ return err, inds
+
+
+def miss_rate(
+ traj: torch.Tensor,
+ traj_gt: torch.Tensor,
+ masks: torch.Tensor,
+ dist_thresh: float = 2) -> torch.Tensor:
+ """
+    Computes miss rate for a mini-batch of trajectories,
+ with respect to ground truth and given distance threshold
+ :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2]
+ :param traj_gt: ground truth trajectory,
+ shape [batch_size, sequence_length, 2]
+ :param masks: masks for varying length ground truth,
+ shape [batch_size, sequence_length]
+ :param dist_thresh: distance threshold for computing miss rate.
+    :return: miss rate over the mini-batch (scalar tensor).
+ """
+ num_modes = traj.shape[1]
+
+ traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1)
+ masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1)
+ dist = traj_gt_rpt - traj[:, :, :, 0:2]
+ dist = torch.pow(dist, exponent=2)
+ dist = torch.sum(dist, dim=3)
+ dist = torch.pow(dist, exponent=0.5)
+ dist[masks_rpt.bool()] = -math.inf
+ dist, _ = torch.max(dist, dim=2)
+ dist, _ = torch.min(dist, dim=1)
+ m_r = torch.sum(torch.as_tensor(dist > dist_thresh)) / len(dist)
+
+ return m_r
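+
+# Illustrative sketch (not part of the original file): a single-mode prediction that drifts
+# 10 m from the ground truth at the last step exceeds the 2 m threshold, so the whole
+# mini-batch counts as a miss.
+#
+#   traj = torch.tensor([[[[0., 0.], [10., 0.]]]])    # [1, 1, 2, 2]
+#   traj_gt = torch.tensor([[[0., 0.], [0., 0.]]])    # [1, 2, 2]
+#   masks = torch.zeros(1, 2)
+#   miss_rate(traj, traj_gt, masks)                   # -> tensor(1.)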
+
+def traj_fde(gt_box, pred_box, final_step):
+    """
+    Computes the final displacement error between a ground truth trajectory and the
+    closest of the predicted trajectory modes, evaluated at the given final step.
+    """
+    if gt_box.traj.shape[0] <= 0:
+        return np.inf
+    final_step = min(gt_box.traj.shape[0], final_step)
+    gt_final = gt_box.traj[None, final_step - 1]
+    pred_final = np.array(pred_box.traj)[:, final_step - 1, :]
+    err = np.sqrt(np.sum(np.square(gt_final - pred_final), axis=-1))
+    return np.min(err)
\ No newline at end of file
diff --git a/mmcv/datasets/eval_utils/nuscenes_eval.py b/mmcv/datasets/eval_utils/nuscenes_eval.py
new file mode 100644
index 0000000..48a136c
--- /dev/null
+++ b/mmcv/datasets/eval_utils/nuscenes_eval.py
@@ -0,0 +1,705 @@
+import argparse
+import copy
+import json
+import numpy as np
+import os
+import time
+from typing import Tuple, Dict, Any
+import tqdm
+from matplotlib import pyplot as plt
+from pyquaternion import Quaternion
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes
+from nuscenes.eval.common.render import setup_axis
+from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp
+from nuscenes.eval.detection.constants import TP_METRICS, TP_METRICS_UNITS, PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS
+from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, DetectionMetricDataList
+from nuscenes.eval.detection.evaluate import NuScenesEval
+from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve
+from nuscenes.eval.tracking.data_classes import TrackingBox
+from nuscenes.utils.data_classes import Box
+from nuscenes.utils.geometry_utils import view_points, BoxVisibility
+from nuscenes.utils.splits import create_splits_scenes
+from nuscenes.eval.detection.utils import category_to_detection_name
+
+
+Axis = Any
+
+def class_tp_curve(md_list: DetectionMetricDataList,
+ metrics: DetectionMetrics,
+ detection_name: str,
+ min_recall: float,
+ dist_th_tp: float,
+ savepath: str = None,
+ ax: Axis = None) -> None:
+ """
+ Plot the true positive curve for the specified class.
+ :param md_list: DetectionMetricDataList instance.
+ :param metrics: DetectionMetrics instance.
+    :param detection_name: The detection class to plot.
+    :param min_recall: Minimum recall value.
+    :param dist_th_tp: The distance threshold used to determine matches.
+    :param savepath: If given, saves the rendering here instead of displaying.
+ :param ax: Axes onto which to render.
+ """
+ # Get metric data for given detection class with tp distance threshold.
+
+ md = md_list[(detection_name, dist_th_tp)]
+ min_recall_ind = round(100 * min_recall)
+ if min_recall_ind <= md.max_recall_ind:
+ # For traffic_cone and barrier only a subset of the metrics are plotted.
+ rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
+ ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
+ else:
+ ylimit = 1.0
+
+ # Prepare axis.
+ if ax is None:
+ ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
+ min_recall=min_recall)
+ ax.set_ylim(0, ylimit)
+
+ # Plot the recall vs. error curve for each tp metric.
+ for metric in TP_METRICS:
+ tp = metrics.get_label_tp(detection_name, metric)
+
+ # Plot only if we have valid data.
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
+ recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
+ else:
+ recall, error = [], []
+
+ # Change legend based on tp value
+ if tp is np.nan:
+ label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
+ elif min_recall_ind > md.max_recall_ind:
+ label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
+ else:
+ label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
+ if metric == 'trans_err':
+ label += f' ({md.max_recall_ind})' # add recall
+ print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
+ ax.plot(recall, error, label=label)
+ ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
+ ax.legend(loc='best')
+
+ if savepath is not None:
+ plt.savefig(savepath)
+ plt.close()
+
+
+class DetectionBox_modified(DetectionBox):
+ def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
+ '''
+        DetectionBox extended with the annotation token, visibility level and frame index.
+ '''
+ super().__init__(*args, **kwargs)
+ self.token = token
+ self.visibility = visibility
+ self.index = index
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'token': self.token,
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'visibility': self.visibility,
+ 'index': self.index
+
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(
+ token=content['token'],
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'],
+ visibility=content['visibility'],
+ index=content['index'],
+ )
+
+
+def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+ Check if a box is visible inside an image without accounting for occlusions.
+ :param box: The box to be checked.
+    :param intrinsic: Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of BoxVisibility.
+ :return True if visibility condition is satisfied.
+ """
+
+ center_3d = box.center.reshape(3, 1)
+ center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, center_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, center_img[1, :] > 0)
+ visible = np.logical_and(visible, center_3d[2, :] > 1)
+
+    in_front = center_3d[2, :] > 0.1  # True if the box center is at least 0.1 meter in front of the camera.
+
+ if vis_level == BoxVisibility.ALL:
+ return all(visible) and all(in_front)
+ elif vis_level == BoxVisibility.ANY:
+ return any(visible) and all(in_front)
+ elif vis_level == BoxVisibility.NONE:
+ return True
+ else:
+ raise ValueError("vis_level: {} not valid".format(vis_level))
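+
+# Illustrative sketch (hypothetical numbers): a box centre 10 m straight ahead of a pinhole
+# camera with focal length 1000 px projects onto the principal point of a 1600x900 image,
+# so every visibility condition above is satisfied.
+#
+#   intrinsic = np.array([[1000., 0., 800.], [0., 1000., 450.], [0., 0., 1.]])
+#   center_3d = np.array([[0.], [0.], [10.]])
+#   view_points(center_3d, intrinsic, normalize=True)[:2, :]  # approx. [[800.], [450.]]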
+
+
+def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
+ vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if a box is visible in the image, but not all of its corners are.
+    :param box: The box to be checked.
+    :param intrinsic: Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of BoxVisibility.
+ :return True if visibility condition is satisfied.
+ """
+
+ corners_3d = box.corners()
+ corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, corners_img[1, :] > 0)
+ visible = np.logical_and(visible, corners_3d[2, :] > 1)
+
+ in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
+
+ if any(visible) and not all(visible) and all(in_front):
+ return True
+ else:
+ return False
+
+
+def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False):
+ """
+ Loads ground truth boxes from DB.
+ :param nusc: A NuScenes instance.
+ :param eval_split: The evaluation split for which we load GT boxes.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+ :param verbose: Whether to print messages to stdout.
+ :return: The GT boxes.
+ """
+
+ # Init.
+ if box_cls == DetectionBox_modified:
+ attribute_map = {a['token']: a['name'] for a in nusc.attribute}
+
+ if verbose:
+ print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
+ # Read out all sample_tokens in DB.
+ sample_tokens_all = [s['token'] for s in nusc.sample]
+ assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
+
+ # Only keep samples from this split.
+ splits = create_splits_scenes()
+
+ # Check compatibility of split with nusc_version.
+ version = nusc.version
+ if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
+ assert version.endswith('trainval'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split in {'mini_train', 'mini_val'}:
+ assert version.endswith('mini'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split == 'test':
+ assert version.endswith('test'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ else:
+ raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
+ .format(eval_split))
+
+ if eval_split == 'test':
+ # Check that you aren't trying to cheat :).
+ assert len(nusc.sample_annotation) > 0, \
+ 'Error: You are trying to evaluate on the test set but you do not have the annotations!'
+ index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ index_map[sample['token']] = index
+ index += 1
+
+ sample_tokens = []
+ for sample_token in sample_tokens_all:
+ scene_token = nusc.get('sample', sample_token)['scene_token']
+ scene_record = nusc.get('scene', scene_token)
+ if scene_record['name'] in splits[eval_split]:
+ sample_tokens.append(sample_token)
+
+ all_annotations = EvalBoxes()
+
+ # Load annotations and filter predictions and annotations.
+ tracking_id_set = set()
+ for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
+
+ sample = nusc.get('sample', sample_token)
+ sample_annotation_tokens = sample['anns']
+
+ sample_boxes = []
+ for sample_annotation_token in sample_annotation_tokens:
+
+ sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
+ if box_cls == DetectionBox_modified:
+ # Get label name in detection task and filter unused labels.
+ detection_name = category_to_detection_name(sample_annotation['category_name'])
+ if detection_name is None:
+ continue
+
+ # Get attribute_name.
+ attr_tokens = sample_annotation['attribute_tokens']
+ attr_count = len(attr_tokens)
+ if attr_count == 0:
+ attribute_name = ''
+ elif attr_count == 1:
+ attribute_name = attribute_map[attr_tokens[0]]
+ else:
+ raise Exception('Error: GT annotations must not have more than one attribute!')
+
+ sample_boxes.append(
+ box_cls(
+ token=sample_annotation_token,
+ sample_token=sample_token,
+ translation=sample_annotation['translation'],
+ size=sample_annotation['size'],
+ rotation=sample_annotation['rotation'],
+ velocity=nusc.box_velocity(sample_annotation['token'])[:2],
+ num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
+ detection_name=detection_name,
+ detection_score=-1.0, # GT samples do not have a score.
+ attribute_name=attribute_name,
+ visibility=sample_annotation['visibility_token'],
+ index=index_map[sample_token]
+ )
+ )
+ elif box_cls == TrackingBox:
+ assert False
+ else:
+ raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
+
+ all_annotations.add_boxes(sample_token, sample_boxes)
+
+ if verbose:
+ print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
+
+ return all_annotations
+
+
+def filter_eval_boxes_by_id(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ id=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filters boxes, keeping only those whose annotation token is in the given id set.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param id: The set of annotation tokens used to keep boxes.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.token in id:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_eval_boxes_by_visibility(
+ ori_eval_boxes: EvalBoxes,
+ visibility=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filters boxes, keeping only those whose visibility token matches the given value.
+    :param ori_eval_boxes: An instance of the EvalBoxes class.
+    :param visibility: The visibility token ('1'-'4') used to keep boxes.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.visibility == visibility:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After visibility based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False):
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ for sample_token in eval_boxes.sample_tokens:
+ if sample_token not in valid_sample_tokens:
+ eval_boxes.boxes.pop(sample_token)
+ return eval_boxes
+
+
+def filter_eval_boxes_by_overlap(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filters boxes, keeping only those whose center is visible in more than one camera (overlap regions).
+ :param nusc: An instance of the NuScenes class.
+ :param eval_boxes: An instance of the EvalBoxes class.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ sample_record = nusc.get('sample', sample_token)
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ count = 0
+ for cam in cams:
+                # Copy-pasted from the nuScenes devkit.
+ sample_data_token = sample_record['data'][cam]
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ new_box = Box(box.translation, box.size, Quaternion(box.rotation),
+ name=box.detection_name, token='')
+
+ # Move box to ego vehicle coord system.
+ new_box.translate(-np.array(pose_record['translation']))
+ new_box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ new_box.translate(-np.array(cs_record['translation']))
+ new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ count += 1
+ # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ # count += 1
+
+ if count > 1:
+ with open('center_overlap.txt', 'a') as f:
+ try:
+ f.write(box.token + '\n')
+                    except Exception:
+ pass
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ verbose = True
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+class NuScenesEval_custom(NuScenesEval):
+ """
+    Custom nuScenes detection evaluation that additionally supports filtering ground truth by visibility, frame index and camera overlap.
+ """
+
+ def __init__(self,
+ nusc: NuScenes,
+ config: DetectionConfig,
+ result_path: str,
+ eval_set: str,
+ output_dir: str = None,
+ verbose: bool = True,
+ overlap_test=False,
+ eval_mask=False,
+ data_infos=None
+ ):
+ """
+ Initialize a DetectionEval object.
+ :param nusc: A NuScenes object.
+ :param config: A DetectionConfig object.
+ :param result_path: Path of the nuScenes JSON result file.
+ :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+ :param output_dir: Folder to save plots and results to.
+ :param verbose: Whether to print to stdout.
+ """
+
+ self.nusc = nusc
+ self.result_path = result_path
+ self.eval_set = eval_set
+ self.output_dir = output_dir
+ self.verbose = verbose
+ self.cfg = config
+ self.overlap_test = overlap_test
+ self.eval_mask = eval_mask
+ self.data_infos = data_infos
+ # Check result file exists.
+ assert os.path.exists(result_path), 'Error: The result file does not exist!'
+
+ # Make dirs.
+ self.plot_dir = os.path.join(self.output_dir, 'plots')
+ if not os.path.isdir(self.output_dir):
+ os.makedirs(self.output_dir)
+ if not os.path.isdir(self.plot_dir):
+ os.makedirs(self.plot_dir)
+
+ # Load data.
+ if verbose:
+ print('Initializing nuScenes detection evaluation')
+ self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
+ verbose=verbose)
+ self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
+
+ # assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+ # "Samples in split doesn't match samples in predictions."
+
+ # Add center distances.
+ self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+ self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+ # Filter boxes (distance, points per box, etc.).
+
+ if verbose:
+ print('Filtering predictions')
+ self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
+ if verbose:
+ print('Filtering ground truth annotations')
+ self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)
+
+ if self.overlap_test:
+ self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
+
+ self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)
+
+ self.all_gt = copy.deepcopy(self.gt_boxes)
+ self.all_preds = copy.deepcopy(self.pred_boxes)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+ self.index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ self.index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ self.index_map[sample['token']] = index
+ index += 1
+
+ def update_gt(self, type_='vis', visibility='1', index=1):
+ if type_ == 'vis':
+ self.visibility_test = True
+ if self.visibility_test:
+ '''[{'description': 'visibility of whole object is between 0 and 40%',
+ 'token': '1',
+ 'level': 'v0-40'},
+ {'description': 'visibility of whole object is between 40 and 60%',
+ 'token': '2',
+ 'level': 'v40-60'},
+ {'description': 'visibility of whole object is between 60 and 80%',
+ 'token': '3',
+ 'level': 'v60-80'},
+ {'description': 'visibility of whole object is between 80 and 100%',
+ 'token': '4',
+ 'level': 'v80-100'}]'''
+
+ self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True)
+
+ elif type_ == 'ord':
+
+ valid_tokens = [key for (key, value) in self.index_map.items() if value == index]
+ # from IPython import embed
+ # embed()
+ self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
+ self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+
+ def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMetricDataList()
+
+ # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths)
+ # self.cfg.dist_ths = [0.3]
+ # self.cfg.dist_fcn_callable
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMetrics(self.cfg)
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None:
+ """
+ Renders various PR and TP curves.
+ :param metrics: DetectionMetrics instance.
+ :param md_list: DetectionMetricDataList instance.
+ """
+ if self.verbose:
+ print('Rendering PR and TP curves')
+
+ def savepath(name):
+ return os.path.join(self.plot_dir, name + '.pdf')
+
+ summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall,
+ dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary'))
+
+ for detection_name in self.cfg.class_names:
+ class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath(detection_name + '_pr'))
+
+ class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp,
+ savepath=savepath(detection_name + '_tp'))
+
+ for dist_th in self.cfg.dist_ths:
+ dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath('dist_pr_' + str(dist_th)))
+
+
+if __name__ == "__main__":
+
+ # Settings.
+ parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('result_path', type=str, help='The submission as a JSON file.')
+ parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics',
+ help='Folder to store result metrics, graphs and example visualizations.')
+ parser.add_argument('--eval_set', type=str, default='val',
+ help='Which dataset split to evaluate on, train, val or test.')
+ parser.add_argument('--dataroot', type=str, default='data/nuscenes',
+ help='Default nuScenes data directory.')
+ parser.add_argument('--version', type=str, default='v1.0-trainval',
+ help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
+ parser.add_argument('--config_path', type=str, default='',
+ help='Path to the configuration file.'
+ 'If no path given, the CVPR 2019 configuration will be used.')
+ parser.add_argument('--plot_examples', type=int, default=0,
+ help='How many example visualizations to write to disk.')
+ parser.add_argument('--render_curves', type=int, default=1,
+ help='Whether to render PR and TP curves to disk.')
+ parser.add_argument('--verbose', type=int, default=1,
+ help='Whether to print to stdout.')
+ args = parser.parse_args()
+
+ result_path_ = os.path.expanduser(args.result_path)
+ output_dir_ = os.path.expanduser(args.output_dir)
+ eval_set_ = args.eval_set
+ dataroot_ = args.dataroot
+ version_ = args.version
+ config_path = args.config_path
+ plot_examples_ = args.plot_examples
+ render_curves_ = bool(args.render_curves)
+ verbose_ = bool(args.verbose)
+
+ if config_path == '':
+ cfg_ = config_factory('detection_cvpr_2019')
+ else:
+ with open(config_path, 'r') as _f:
+ cfg_ = DetectionConfig.deserialize(json.load(_f))
+
+ nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
+ nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
+ output_dir=output_dir_, verbose=verbose_)
+ for vis in ['1', '2', '3', '4']:
+ nusc_eval.update_gt(type_='vis', visibility=vis)
+ print(f'================ {vis} ===============')
+ nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_)
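+
+# Example invocation (hypothetical paths):
+#   python mmcv/datasets/eval_utils/nuscenes_eval.py /path/to/results_nusc.json \
+#       --output_dir ~/nuscenes-metrics --eval_set val \
+#       --dataroot data/nuscenes --version v1.0-trainval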
diff --git a/mmcv/datasets/eval_utils/nuscenes_eval_motion.py b/mmcv/datasets/eval_utils/nuscenes_eval_motion.py
new file mode 100644
index 0000000..8ff66f0
--- /dev/null
+++ b/mmcv/datasets/eval_utils/nuscenes_eval_motion.py
@@ -0,0 +1,933 @@
+import argparse
+import copy
+import json
+import os
+import random
+import time
+from typing import Tuple, Dict, Any
+
+import numpy as np
+import pycocotools.mask as mask_util
+import tqdm
+from matplotlib import pyplot as plt
+from prettytable import PrettyTable
+from pyquaternion import Quaternion
+from IPython import embed
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.common.loaders import add_center_dist, filter_eval_boxes
+from nuscenes.eval.common.render import setup_axis
+from nuscenes.eval.common.utils import boxes_to_sensor, quaternion_yaw
+from nuscenes.eval.detection.algo import calc_ap, calc_tp
+from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \
+    PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS
+from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionMetricData, \
+    DetectionBox, DetectionMetricDataList
+from nuscenes.eval.detection.evaluate import NuScenesEval
+from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample
+from nuscenes.utils.data_classes import Box, LidarPointCloud
+from nuscenes.utils.geometry_utils import view_points, BoxVisibility
+from mmcv.core.bbox.iou_calculators import BboxOverlaps3D
+
+from .eval_utils import load_prediction, load_gt, accumulate, accumulate_motion, \
+    DetectionMotionBox, DetectionMotionBox_modified, DetectionMotionMetricData, \
+    DetectionMotionMetrics, DetectionMotionMetricDataList
+from .metric_utils import traj_fde
+
+TP_METRICS = [
+ 'trans_err',
+ 'scale_err',
+ 'orient_err',
+ 'vel_err',
+ 'attr_err',
+ 'min_ade_err',
+ 'min_fde_err',
+ 'miss_rate_err']
+TP_TRAJ_METRICS = ['min_ade_err', 'min_fde_err', 'miss_rate_err']
+Axis = Any
+
+
+def class_tp_curve(md_list: DetectionMetricDataList,
+ metrics: DetectionMetrics,
+ detection_name: str,
+ min_recall: float,
+ dist_th_tp: float,
+ savepath: str = None,
+ ax: Axis = None) -> None:
+ """
+ Plot the true positive curve for the specified class.
+ :param md_list: DetectionMetricDataList instance.
+ :param metrics: DetectionMetrics instance.
+    :param detection_name: The detection class to plot.
+    :param min_recall: Minimum recall value.
+    :param dist_th_tp: The distance threshold used to determine matches.
+    :param savepath: If given, saves the rendering here instead of displaying.
+ :param ax: Axes onto which to render.
+ """
+ # Get metric data for given detection class with tp distance threshold.
+
+ md = md_list[(detection_name, dist_th_tp)]
+ min_recall_ind = round(100 * min_recall)
+ if min_recall_ind <= md.max_recall_ind:
+ # For traffic_cone and barrier only a subset of the metrics are
+ # plotted.
+ rel_metrics = [
+ m for m in TP_METRICS if not np.isnan(
+ metrics.get_label_tp(
+ detection_name, m))]
+ ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1])
+ for metric in rel_metrics]) * 1.1
+ else:
+ ylimit = 1.0
+
+ # Prepare axis.
+ if ax is None:
+ ax = setup_axis(
+ title=PRETTY_DETECTION_NAMES[detection_name],
+ xlabel='Recall',
+ ylabel='Error',
+ xlim=1,
+ min_recall=min_recall)
+ ax.set_ylim(0, ylimit)
+
+ # Plot the recall vs. error curve for each tp metric.
+ for metric in TP_METRICS:
+ tp = metrics.get_label_tp(detection_name, metric)
+
+ # Plot only if we have valid data.
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
+ recall, error = md.recall[:md.max_recall_ind +
+ 1], getattr(md, metric)[:md.max_recall_ind + 1]
+ else:
+ recall, error = [], []
+
+ # Change legend based on tp value
+ if tp is np.nan:
+ label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
+ elif min_recall_ind > md.max_recall_ind:
+ label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
+ else:
+ label = '{}: {:.2f} ({})'.format(
+ PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
+ if metric == 'trans_err':
+ label += f' ({md.max_recall_ind})' # add recall
+ print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
+ ax.plot(recall, error, label=label)
+ ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
+ ax.legend(loc='best')
+
+ if savepath is not None:
+ plt.savefig(savepath)
+ plt.close()
+
+
+def center_in_image(box,
+ intrinsic: np.ndarray,
+ imsize: Tuple[int,
+ int],
+ vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+ Check if a box is visible inside an image without accounting for occlusions.
+ :param box: The box to be checked.
+    :param intrinsic: Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of BoxVisibility.
+ :return True if visibility condition is satisfied.
+ """
+
+ center_3d = box.center.reshape(3, 1)
+ center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(
+ center_img[0, :] > 0, center_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, center_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, center_img[1, :] > 0)
+ visible = np.logical_and(visible, center_3d[2, :] > 1)
+
+    # True if the box center is at least 0.1 meter in front of the camera.
+ in_front = center_3d[2, :] > 0.1
+
+ if vis_level == BoxVisibility.ALL:
+ return all(visible) and all(in_front)
+ elif vis_level == BoxVisibility.ANY:
+ return any(visible) and all(in_front)
+ elif vis_level == BoxVisibility.NONE:
+ return True
+ else:
+ raise ValueError("vis_level: {} not valid".format(vis_level))
+
+
+def exist_corners_in_image_but_not_all(box,
+ intrinsic: np.ndarray,
+ imsize: Tuple[int,
+ int],
+ vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if a box is visible in the image, but not all of its corners are.
+    :param box: The box to be checked.
+    :param intrinsic: Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of BoxVisibility.
+ :return True if visibility condition is satisfied.
+ """
+
+ corners_3d = box.corners()
+ corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(
+ corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, corners_img[1, :] > 0)
+ visible = np.logical_and(visible, corners_3d[2, :] > 1)
+
+ # True if a corner is at least 0.1 meter in front of the camera.
+ in_front = corners_3d[2, :] > 0.1
+
+ if any(visible) and not all(visible) and all(in_front):
+ return True
+ else:
+ return False
+
+
+def filter_eval_boxes_by_id(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ id=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filters boxes, keeping only those whose annotation token is in the given id set.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param id: The set of annotation tokens used to keep boxes.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.token in id:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_eval_boxes_by_visibility(
+ ori_eval_boxes: EvalBoxes,
+ visibility=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filters boxes, keeping only those whose visibility token matches the given value.
+    :param ori_eval_boxes: An instance of the EvalBoxes class.
+    :param visibility: The visibility token ('1'-'4') used to keep boxes.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.visibility == visibility:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After visibility based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_by_sample_token(
+ ori_eval_boxes,
+ valid_sample_tokens=[],
+ verbose=False):
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ for sample_token in eval_boxes.sample_tokens:
+ if sample_token not in valid_sample_tokens:
+ eval_boxes.boxes.pop(sample_token)
+ return eval_boxes
+
+
+def filter_eval_boxes_by_overlap(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filters boxes, keeping only those whose center is visible in more than one camera (overlap regions).
+ :param nusc: An instance of the NuScenes class.
+ :param eval_boxes: An instance of the EvalBoxes class.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ sample_record = nusc.get('sample', sample_token)
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ count = 0
+ for cam in cams:
+                # Copy-pasted from the nuScenes devkit.
+ sample_data_token = sample_record['data'][cam]
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get(
+ 'calibrated_sensor',
+ sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ new_box = Box(
+ box.translation,
+ box.size,
+ Quaternion(
+ box.rotation),
+ name=box.detection_name,
+ token='')
+
+ # Move box to ego vehicle coord system.
+ new_box.translate(-np.array(pose_record['translation']))
+ new_box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ new_box.translate(-np.array(cs_record['translation']))
+ new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if center_in_image(
+ new_box,
+ cam_intrinsic,
+ imsize,
+ vis_level=BoxVisibility.ANY):
+ count += 1
+ # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ # count += 1
+
+ if count > 1:
+ with open('center_overlap.txt', 'a') as f:
+ try:
+ f.write(box.token + '\n')
+ except BaseException:
+ pass
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+class MotionEval(NuScenesEval):
+ """
+    Evaluation of joint detection and motion forecasting results, built on top
+    of the standard nuScenes detection evaluation (NuScenesEval).
+ """
+
+ def __init__(self,
+ nusc: NuScenes,
+ config: DetectionConfig,
+ result_path: str,
+ eval_set: str,
+ output_dir: str = None,
+ verbose: bool = True,
+ overlap_test=False,
+ eval_mask=False,
+ data_infos=None,
+ category_convert_type='motion_category',
+ ):
+ """
+        Initialize a MotionEval object.
+ :param nusc: A NuScenes object.
+ :param config: A DetectionConfig object.
+ :param result_path: Path of the nuScenes JSON result file.
+ :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+ :param output_dir: Folder to save plots and results to.
+ :param verbose: Whether to print to stdout.
+ """
+
+ self.nusc = nusc
+ self.result_path = result_path
+ self.eval_set = eval_set
+ self.output_dir = output_dir
+ self.verbose = verbose
+ self.cfg = config
+ self.overlap_test = overlap_test
+ self.eval_mask = eval_mask
+ self.data_infos = data_infos
+ # Check result file exists.
+ assert os.path.exists(
+ result_path), 'Error: The result file does not exist!'
+
+ # Make dirs.
+ self.plot_dir = os.path.join(self.output_dir, 'plots')
+ if not os.path.isdir(self.output_dir):
+ os.makedirs(self.output_dir)
+ if not os.path.isdir(self.plot_dir):
+ os.makedirs(self.plot_dir)
+
+ # Load data.
+ if verbose:
+ print('Initializing nuScenes detection evaluation')
+ self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionMotionBox,
+ verbose=verbose, category_convert_type=category_convert_type)
+ self.gt_boxes = load_gt(
+ self.nusc,
+ self.eval_set,
+ DetectionMotionBox_modified,
+ verbose=verbose,
+ category_convert_type=category_convert_type)
+
+ assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+            "Samples in split don't match samples in predictions."
+
+ # Add center distances.
+ self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+ self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+ # Filter boxes (distance, points per box, etc.).
+
+ if verbose:
+ print('Filtering predictions')
+ self.pred_boxes = filter_eval_boxes(
+ nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
+ if verbose:
+ print('Filtering ground truth annotations')
+ self.gt_boxes = filter_eval_boxes(
+ nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)
+
+ if self.overlap_test:
+ self.pred_boxes = filter_eval_boxes_by_overlap(
+ self.nusc, self.pred_boxes)
+
+ self.gt_boxes = filter_eval_boxes_by_overlap(
+ self.nusc, self.gt_boxes, verbose=True)
+
+ self.all_gt = copy.deepcopy(self.gt_boxes)
+ self.all_preds = copy.deepcopy(self.pred_boxes)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
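+        # Build a per-scene, 1-based frame index for every sample token so that
+        # update_gt(type_='ord', index=k) can restrict evaluation to the k-th
+        # frame of each scene.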
+ self.index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ self.index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ self.index_map[sample['token']] = index
+ index += 1
+
+ def update_gt(self, type_='vis', visibility='1', index=1):
+ if type_ == 'vis':
+ self.visibility_test = True
+ if self.visibility_test:
+ '''[{'description': 'visibility of whole object is between 0 and 40%',
+ 'token': '1',
+ 'level': 'v0-40'},
+ {'description': 'visibility of whole object is between 40 and 60%',
+ 'token': '2',
+ 'level': 'v40-60'},
+ {'description': 'visibility of whole object is between 60 and 80%',
+ 'token': '3',
+ 'level': 'v60-80'},
+ {'description': 'visibility of whole object is between 80 and 100%',
+ 'token': '4',
+ 'level': 'v80-100'}]'''
+
+ self.gt_boxes = filter_eval_boxes_by_visibility(
+ self.all_gt, visibility, verbose=True)
+
+ elif type_ == 'ord':
+
+ valid_tokens = [
+ key for (
+ key,
+ value) in self.index_map.items() if value == index]
+ # from IPython import embed
+ # embed()
+ self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
+ self.pred_boxes = filter_by_sample_token(
+ self.all_preds, valid_tokens)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+ def evaluate(self) -> Tuple[DetectionMotionMetrics,
+ DetectionMotionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMotionMetricDataList()
+
+ # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths)
+ # self.cfg.dist_ths = [0.3]
+ # self.cfg.dist_fcn_callable
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md, _, _, _ = accumulate(
+ self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMotionMetrics(self.cfg)
+
+ traj_metrics = {}
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(
+ metric_data,
+ self.cfg.min_recall,
+ self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(
+ class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in [
+ 'attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ if metric_name in TP_TRAJ_METRICS:
+ if class_name not in traj_metrics:
+ traj_metrics[class_name] = {}
+ traj_metrics[class_name][metric_name] = tp
+ metrics.add_label_tp(class_name, metric_name, tp)
+ print_traj_metrics(traj_metrics)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def evaluate_motion(
+ self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ self.cfg.dist_ths = [1.0]
+ self.cfg.dist_th_tp = 1.0 # center dist for detection
+ traj_dist_th = 2.0 # FDE for traj
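+        # A prediction counts as a true positive only when it matches a GT box
+        # on center distance and its forecast trajectory is within the FDE
+        # threshold (see accumulate_motion below).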
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMotionMetricDataList()
+
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md, _, _, _ = accumulate_motion(
+ self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, traj_fde, dist_th, traj_dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMotionMetrics(self.cfg)
+
+ traj_metrics = {}
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(
+ metric_data,
+ self.cfg.min_recall,
+ self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(
+ class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in [
+ 'attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ if metric_name in TP_TRAJ_METRICS:
+ if class_name not in traj_metrics:
+ traj_metrics[class_name] = {}
+ traj_metrics[class_name][metric_name] = tp
+ metrics.add_label_tp(class_name, metric_name, tp)
+ print_traj_metrics(traj_metrics)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def evaluate_epa(
+ self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ self.cfg.dist_ths = [2.0]
+ self.cfg.dist_th_tp = 2.0 # center dist for detection
+ traj_dist_th = 2.0 # FDE for traj
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMotionMetricDataList()
+
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md, N_det_tp, N_det_fp, N_det_gt = accumulate(
+ self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+ md, N_det_traj_tp, N_det_traj_fp, N_det_traj_gt = accumulate_motion(
+ self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, traj_fde, dist_th, traj_dist_th)
+ metric_data_list.set(class_name, dist_th, md)
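+                # EPA (end-to-end prediction accuracy): detections whose forecast
+                # is also correct count as TP, false-positive detections are
+                # penalized with weight 0.5, normalized by the number of GT boxes.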
+ EPA = (N_det_traj_tp - 0.5 * N_det_fp) / (N_det_gt + 1e-5)
+ print(N_det_traj_tp, N_det_fp, N_det_gt)
+ print('EPA ', class_name, EPA)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMotionMetrics(self.cfg)
+
+ traj_metrics = {}
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(
+ metric_data,
+ self.cfg.min_recall,
+ self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(
+ class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in [
+ 'attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ if metric_name in TP_TRAJ_METRICS:
+ if class_name not in traj_metrics:
+ traj_metrics[class_name] = {}
+ traj_metrics[class_name][metric_name] = tp
+ metrics.add_label_tp(class_name, metric_name, tp)
+ print_traj_metrics(traj_metrics)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def main(self,
+ plot_examples: int = 0,
+ render_curves: bool = True,
+ eval_mode: str = 'standard') -> Dict[str, Any]:
+ """
+ Main function that loads the evaluation code, visualizes samples, runs the evaluation and renders stat plots.
+ :param plot_examples: How many example visualizations to write to disk.
+        :param render_curves: Whether to render PR and TP curves to disk.
+        :param eval_mode: Evaluation mode, one of 'standard', 'motion_map' or 'epa'.
+        :return: A dict that stores the high-level metrics and meta data.
+ """
+ if plot_examples > 0:
+ # Select a random but fixed subset to plot.
+ random.seed(42)
+ sample_tokens = list(self.sample_tokens)
+ random.shuffle(sample_tokens)
+ sample_tokens = sample_tokens[:plot_examples]
+
+ # Visualize samples.
+ example_dir = os.path.join(self.output_dir, 'examples')
+ if not os.path.isdir(example_dir):
+ os.mkdir(example_dir)
+ for sample_token in sample_tokens:
+ visualize_sample(self.nusc,
+ sample_token,
+ self.gt_boxes if self.eval_set != 'test' else EvalBoxes(),
+ # Don't render test GT.
+ self.pred_boxes,
+ eval_range=max(self.cfg.class_range.values()),
+ savepath=os.path.join(example_dir, '{}.png'.format(sample_token)))
+
+ # Run evaluation.
+ if eval_mode == 'motion_map':
+ metrics, metric_data_list = self.evaluate_motion()
+ elif eval_mode == 'standard':
+ metrics, metric_data_list = self.evaluate()
+ elif eval_mode == 'epa':
+ metrics, metric_data_list = self.evaluate_epa()
+ else:
+ raise NotImplementedError
+ # Render PR and TP curves.
+ if render_curves:
+ self.render(metrics, metric_data_list)
+
+ # Dump the metric data, meta and metrics to disk.
+ if self.verbose:
+ print('Saving metrics to: %s' % self.output_dir)
+ metrics_summary = metrics.serialize()
+ metrics_summary['meta'] = self.meta.copy()
+ with open(os.path.join(self.output_dir, 'metrics_summary.json'), 'w') as f:
+ json.dump(metrics_summary, f, indent=2)
+ with open(os.path.join(self.output_dir, 'metrics_details.json'), 'w') as f:
+ json.dump(metric_data_list.serialize(), f, indent=2)
+
+ # Print high-level metrics.
+ print('mAP: %.4f' % (metrics_summary['mean_ap']))
+ err_name_mapping = {
+ 'trans_err': 'mATE',
+ 'scale_err': 'mASE',
+ 'orient_err': 'mAOE',
+ 'vel_err': 'mAVE',
+ 'attr_err': 'mAAE'
+ }
+ for tp_name, tp_val in metrics_summary['tp_errors'].items():
+ print('%s: %.4f' % (err_name_mapping[tp_name], tp_val))
+ print('NDS: %.4f' % (metrics_summary['nd_score']))
+ print('Eval time: %.1fs' % metrics_summary['eval_time'])
+
+ # Print per-class metrics.
+ print()
+ print('Per-class results:')
+ print('Object Class\tAP\tATE\tASE\tAOE\tAVE\tAAE')
+ class_aps = metrics_summary['mean_dist_aps']
+ class_tps = metrics_summary['label_tp_errors']
+ for class_name in class_aps.keys():
+ print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
+ % (class_name, class_aps[class_name],
+ class_tps[class_name]['trans_err'],
+ class_tps[class_name]['scale_err'],
+ class_tps[class_name]['orient_err'],
+ class_tps[class_name]['vel_err'],
+ class_tps[class_name]['attr_err']))
+
+ return metrics_summary
+
+ def render(self, metrics: DetectionMetrics,
+ md_list: DetectionMetricDataList) -> None:
+ """
+ Renders various PR and TP curves.
+ :param metrics: DetectionMetrics instance.
+ :param md_list: DetectionMetricDataList instance.
+ """
+ if self.verbose:
+ print('Rendering PR and TP curves')
+
+ def savepath(name):
+ return os.path.join(self.plot_dir, name + '.pdf')
+
+ summary_plot(
+ md_list,
+ metrics,
+ min_precision=self.cfg.min_precision,
+ min_recall=self.cfg.min_recall,
+ dist_th_tp=self.cfg.dist_th_tp,
+ savepath=savepath('summary'))
+
+ for detection_name in self.cfg.class_names:
+ class_pr_curve(
+ md_list,
+ metrics,
+ detection_name,
+ self.cfg.min_precision,
+ self.cfg.min_recall,
+                savepath=savepath(detection_name + '_pr'))
+
+ class_tp_curve(
+ md_list,
+ metrics,
+ detection_name,
+ self.cfg.min_recall,
+ self.cfg.dist_th_tp,
+                savepath=savepath(detection_name + '_tp'))
+
+ for dist_th in self.cfg.dist_ths:
+ dist_pr_curve(
+ md_list,
+ metrics,
+ dist_th,
+ self.cfg.min_precision,
+ self.cfg.min_recall,
+                savepath=savepath('dist_pr_' + str(dist_th)))
+
+
+def print_traj_metrics(metrics):
+ class_names = metrics.keys()
+ x = PrettyTable()
+ x.field_names = ["class names"] + TP_TRAJ_METRICS
+ for class_name in metrics.keys():
+ row_data = [class_name]
+ for m in TP_TRAJ_METRICS:
+ row_data.append('%.4f' % metrics[class_name][m])
+ x.add_row(row_data)
+ print(x)
+
+
+if __name__ == "__main__":
+
+ # Settings.
+ parser = argparse.ArgumentParser(
+ description='Evaluate nuScenes detection results.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ 'result_path',
+ type=str,
+ help='The submission as a JSON file.')
+ parser.add_argument(
+ '--output_dir',
+ type=str,
+ default='~/nuscenes-metrics',
+ help='Folder to store result metrics, graphs and example visualizations.')
+ parser.add_argument(
+ '--eval_set',
+ type=str,
+ default='val',
+ help='Which dataset split to evaluate on, train, val or test.')
+ parser.add_argument('--dataroot', type=str, default='data/nuscenes',
+ help='Default nuScenes data directory.')
+ parser.add_argument(
+ '--version',
+ type=str,
+ default='v1.0-trainval',
+ help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
+ parser.add_argument(
+ '--config_path',
+ type=str,
+ default='',
+ help='Path to the configuration file.'
+ 'If no path given, the CVPR 2019 configuration will be used.')
+ parser.add_argument(
+ '--plot_examples',
+ type=int,
+ default=0,
+ help='How many example visualizations to write to disk.')
+ parser.add_argument('--render_curves', type=int, default=1,
+ help='Whether to render PR and TP curves to disk.')
+ parser.add_argument('--verbose', type=int, default=1,
+ help='Whether to print to stdout.')
+ args = parser.parse_args()
+
+ result_path_ = os.path.expanduser(args.result_path)
+ output_dir_ = os.path.expanduser(args.output_dir)
+ eval_set_ = args.eval_set
+ dataroot_ = args.dataroot
+ version_ = args.version
+ config_path = args.config_path
+ plot_examples_ = args.plot_examples
+ render_curves_ = bool(args.render_curves)
+ verbose_ = bool(args.verbose)
+
+ if config_path == '':
+ cfg_ = config_factory('detection_cvpr_2019')
+ else:
+ with open(config_path, 'r') as _f:
+ cfg_ = DetectionConfig.deserialize(json.load(_f))
+
+ nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
+ nusc_eval = MotionEval(
+ nusc_,
+ config=cfg_,
+ result_path=result_path_,
+ eval_set=eval_set_,
+ output_dir=output_dir_,
+ verbose=verbose_)
+ for vis in ['1', '2', '3', '4']:
+ nusc_eval.update_gt(type_='vis', visibility=vis)
+ print(f'================ {vis} ===============')
+ nusc_eval.main(
+ plot_examples=plot_examples_,
+ render_curves=render_curves_)
diff --git a/mmcv/datasets/lyft_dataset.py b/mmcv/datasets/lyft_dataset.py
new file mode 100644
index 0000000..34707ee
--- /dev/null
+++ b/mmcv/datasets/lyft_dataset.py
@@ -0,0 +1,561 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+import pandas as pd
+import tempfile
+from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
+from lyft_dataset_sdk.utils.data_classes import Box as LyftBox
+from os import path as osp
+from pyquaternion import Quaternion
+
+from mmcv.core.evaluation.lyft_eval import lyft_eval
+from mmcv.datasets import DATASETS
+from mmcv.core import show_result
+# from mmcv.core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes
+from mmcv.core.bbox.structures.box_3d_mode import Box3DMode
+from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from .custom_3d import Custom3DDataset
+from .pipelines import Compose
+
+
+@DATASETS.register_module()
+class LyftDataset(Custom3DDataset):
+ r"""Lyft Dataset.
+
+ This class serves as the API for experiments on the Lyft Dataset.
+
+ Please refer to
+ ``_
+ for data downloading.
+
+ Args:
+ ann_file (str): Path of annotation file.
+ pipeline (list[dict], optional): Pipeline used for data processing.
+ Defaults to None.
+ data_root (str): Path of dataset root.
+ classes (tuple[str], optional): Classes used in the dataset.
+ Defaults to None.
+ load_interval (int, optional): Interval of loading the dataset. It is
+ used to uniformly sample the dataset. Defaults to 1.
+ modality (dict, optional): Modality to specify the sensor data used
+ as input. Defaults to None.
+ box_type_3d (str, optional): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            in its original format and then convert it to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options include
+
+ - 'LiDAR': Box in LiDAR coordinates.
+ - 'Depth': Box in depth coordinates, usually for indoor dataset.
+ - 'Camera': Box in camera coordinates.
+ filter_empty_gt (bool, optional): Whether to filter empty GT.
+ Defaults to True.
+ test_mode (bool, optional): Whether the dataset is in test mode.
+ Defaults to False.
+ """ # noqa: E501
+ NameMapping = {
+ 'bicycle': 'bicycle',
+ 'bus': 'bus',
+ 'car': 'car',
+ 'emergency_vehicle': 'emergency_vehicle',
+ 'motorcycle': 'motorcycle',
+ 'other_vehicle': 'other_vehicle',
+ 'pedestrian': 'pedestrian',
+ 'truck': 'truck',
+ 'animal': 'animal'
+ }
+ DefaultAttribute = {
+ 'car': 'is_stationary',
+ 'truck': 'is_stationary',
+ 'bus': 'is_stationary',
+ 'emergency_vehicle': 'is_stationary',
+ 'other_vehicle': 'is_stationary',
+ 'motorcycle': 'is_stationary',
+ 'bicycle': 'is_stationary',
+ 'pedestrian': 'is_stationary',
+ 'animal': 'is_stationary'
+ }
+ CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+ 'motorcycle', 'bicycle', 'pedestrian', 'animal')
+
+ def __init__(self,
+ ann_file,
+ pipeline=None,
+ data_root=None,
+ classes=None,
+ load_interval=1,
+ modality=None,
+ box_type_3d='LiDAR',
+ filter_empty_gt=True,
+ test_mode=False):
+ self.load_interval = load_interval
+ super().__init__(
+ data_root=data_root,
+ ann_file=ann_file,
+ pipeline=pipeline,
+ classes=classes,
+ modality=modality,
+ box_type_3d=box_type_3d,
+ filter_empty_gt=filter_empty_gt,
+ test_mode=test_mode)
+
+ if self.modality is None:
+ self.modality = dict(
+ use_camera=False,
+ use_lidar=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False,
+ )
+
+ def load_annotations(self, ann_file):
+ """Load annotations from ann_file.
+
+ Args:
+ ann_file (str): Path of the annotation file.
+
+ Returns:
+ list[dict]: List of annotations sorted by timestamps.
+ """
+ data = mmcv.load(ann_file)
+ data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp']))
+ data_infos = data_infos[::self.load_interval]
+ self.metadata = data['metadata']
+ self.version = self.metadata['version']
+ return data_infos
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): sample index
+ - pts_filename (str): filename of point clouds
+ - sweeps (list[dict]): infos of sweeps
+ - timestamp (float): sample timestamp
+ - img_filename (str, optional): image filename
+ - lidar2img (list[np.ndarray], optional): transformations \
+ from lidar to different cameras
+ - ann_info (dict): annotation info
+ """
+ info = self.data_infos[index]
+
+        # standard protocol modified from SECOND.Pytorch
+ input_dict = dict(
+ sample_idx=info['token'],
+ pts_filename=info['lidar_path'],
+ sweeps=info['sweeps'],
+ timestamp=info['timestamp'] / 1e6,
+ )
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ for cam_type, cam_info in info['cams'].items():
+ image_paths.append(cam_info['data_path'])
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+ lidar2cam_t = cam_info[
+ 'sensor2lidar_translation'] @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
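+                # lidar2cam_rt is assembled in transposed (row-vector) form,
+                # hence the .T below when composing it with the padded intrinsic.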
+ intrinsic = cam_info['cam_intrinsic']
+ viewpad = np.eye(4)
+ viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ lidar2img_rts.append(lidar2img_rt)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ ))
+
+ if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+
+ return input_dict
+
+ def get_ann_info(self, index):
+ """Get annotation info according to the given index.
+
+ Args:
+ index (int): Index of the annotation data to get.
+
+ Returns:
+ dict: Annotation information consists of the following keys:
+
+ - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
+ 3D ground truth bboxes.
+ - gt_labels_3d (np.ndarray): Labels of ground truths.
+ - gt_names (list[str]): Class names of ground truths.
+ """
+ info = self.data_infos[index]
+ gt_bboxes_3d = info['gt_boxes']
+ gt_names_3d = info['gt_names']
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ if 'gt_shape' in info:
+ gt_shape = info['gt_shape']
+ gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_shape], axis=-1)
+
+ # the lyft box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ )
+ return anns_results
+
+ def _format_bbox(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+
+ Returns:
+ str: Path of the output json file.
+ """
+ lyft_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
+ annos = []
+ boxes = output_to_lyft_box(det)
+ sample_token = self.data_infos[sample_id]['token']
+ boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes)
+ for i, box in enumerate(boxes):
+ name = mapped_class_names[box.label]
+ lyft_anno = dict(
+ sample_token=sample_token,
+ translation=box.center.tolist(),
+ size=box.wlh.tolist(),
+ rotation=box.orientation.elements.tolist(),
+ name=name,
+ score=box.score)
+ annos.append(lyft_anno)
+ lyft_annos[sample_token] = annos
+ lyft_submissions = {
+ 'meta': self.modality,
+ 'results': lyft_annos,
+ }
+
+ mmcv.mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_lyft.json')
+        print('Results written to', res_path)
+ mmcv.dump(lyft_submissions, res_path)
+ return res_path
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in Lyft protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+ lyft = Lyft(
+ data_path=osp.join(self.data_root, self.version),
+ json_path=osp.join(self.data_root, self.version, self.version),
+ verbose=True)
+ eval_set_map = {
+ 'v1.01-train': 'val',
+ }
+ metrics = lyft_eval(lyft, self.data_root, result_path,
+ eval_set_map[self.version], output_dir, logger)
+
+ # record metrics
+ detail = dict()
+ metric_prefix = f'{result_name}_Lyft'
+
+ for i, name in enumerate(metrics['class_names']):
+ AP = float(metrics['mAPs_cate'][i])
+ detail[f'{metric_prefix}/{name}_AP'] = AP
+
+ detail[f'{metric_prefix}/mAP'] = metrics['Final mAP']
+ return detail
+
+ def format_results(self, results, jsonfile_prefix=None, csv_savepath=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ csv_savepath (str | None): The path for saving csv files.
+ It includes the file path and the csv filename,
+ e.g., "a/b/filename.csv". If not specified,
+ the result will not be converted to csv file.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+                dict containing the json filepaths, `tmp_dir` is the temporary \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ assert len(results) == len(self), (
+ 'The length of results is not equal to the dataset len: {} != {}'.
+ format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ # currently the output prediction results could be in two formats
+ # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
+ # 2. list of dict('pts_bbox' or 'img_bbox':
+ # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
+ # this is a workaround to enable evaluation of both formats on Lyft
+ # refer to https://github.com/open-mmlab/mmdetection3d/issues/449
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+                print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+ if csv_savepath is not None:
+ self.json2csv(result_files['pts_bbox'], csv_savepath)
+ return result_files, tmp_dir
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ csv_savepath=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in Lyft protocol.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ csv_savepath (str | None): The path for saving csv files.
+ It includes the file path and the csv filename,
+ e.g., "a/b/filename.csv". If not specified,
+ the result will not be converted to csv file.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict[str, float]: Evaluation results.
+ """
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix,
+ csv_savepath)
+
+ if isinstance(result_files, dict):
+ results_dict = dict()
+ for name in result_names:
+ print(f'Evaluating bboxes of {name}')
+ ret_dict = self._evaluate_single(result_files[name])
+ results_dict.update(ret_dict)
+ elif isinstance(result_files, str):
+ results_dict = self._evaluate_single(result_files)
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+ return results_dict
+
+ def _build_default_pipeline(self):
+ """Build the default pipeline for this dataset."""
+ pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=dict(backend='disk')),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=dict(backend='disk')),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=self.CLASSES,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ]
+ return Compose(pipeline)
+
+ def show(self, results, out_dir, show=True, pipeline=None):
+ """Results visualization.
+
+ Args:
+ results (list[dict]): List of bounding boxes results.
+ out_dir (str): Output directory of visualization result.
+ show (bool): Visualize the results online.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+ """
+ assert out_dir is not None, 'Expect out_dir, got none.'
+ pipeline = self._get_pipeline(pipeline)
+ for i, result in enumerate(results):
+ if 'pts_bbox' in result.keys():
+ result = result['pts_bbox']
+ data_info = self.data_infos[i]
+ pts_path = data_info['lidar_path']
+ file_name = osp.split(pts_path)[-1].split('.')[0]
+ points = self._extract_data(i, pipeline, 'points').numpy()
+ points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
+ Coord3DMode.DEPTH)
+ inds = result['scores_3d'] > 0.1
+ gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy()
+ show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
+ Box3DMode.DEPTH)
+ pred_bboxes = result['boxes_3d'][inds].tensor.numpy()
+ show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
+ Box3DMode.DEPTH)
+ show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir,
+ file_name, show)
+
+ def json2csv(self, json_path, csv_savepath):
+ """Convert the json file to csv format for submission.
+
+ Args:
+ json_path (str): Path of the result json file.
+ csv_savepath (str): Path to save the csv file.
+ """
+ results = mmcv.load(json_path)['results']
+ sample_list_path = osp.join(self.data_root, 'sample_submission.csv')
+ data = pd.read_csv(sample_list_path)
+ Id_list = list(data['Id'])
+ pred_list = list(data['PredictionString'])
+ cnt = 0
+ print('Converting the json to csv...')
+ for token in results.keys():
+ cnt += 1
+ predictions = results[token]
+ prediction_str = ''
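+            # Each box is serialized as
+            # "score x y z width length height yaw name" to build the
+            # PredictionString column of the submission csv.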
+ for i in range(len(predictions)):
+ prediction_str += \
+ str(predictions[i]['score']) + ' ' + \
+ str(predictions[i]['translation'][0]) + ' ' + \
+ str(predictions[i]['translation'][1]) + ' ' + \
+ str(predictions[i]['translation'][2]) + ' ' + \
+ str(predictions[i]['size'][0]) + ' ' + \
+ str(predictions[i]['size'][1]) + ' ' + \
+ str(predictions[i]['size'][2]) + ' ' + \
+ str(Quaternion(list(predictions[i]['rotation']))
+ .yaw_pitch_roll[0]) + ' ' + \
+ predictions[i]['name'] + ' '
+ prediction_str = prediction_str[:-1]
+ idx = Id_list.index(token)
+ pred_list[idx] = prediction_str
+ df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list})
+ mmcv.mkdir_or_exist(os.path.dirname(csv_savepath))
+ df.to_csv(csv_savepath, index=False)
+
+
+def output_to_lyft_box(detection):
+ """Convert the output to the box class in the Lyft.
+
+ Args:
+ detection (dict): Detection results.
+
+ Returns:
+ list[:obj:`LyftBox`]: List of standard LyftBoxes.
+ """
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+ # TODO: check whether this is necessary
+ # with dir_offset & dir_limit in the head
+ box_yaw = -box_yaw - np.pi / 2
+
+ box_list = []
+ for i in range(len(box3d)):
+ quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ box = LyftBox(
+ box_gravity_center[i],
+ box_dims[i],
+ quat,
+ label=labels[i],
+ score=scores[i])
+ box_list.append(box)
+ return box_list
+
+
+def lidar_lyft_box_to_global(info, boxes):
+ """Convert the box from ego to global coordinate.
+
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes.
+
+ Returns:
+ list: List of standard LyftBoxes in the global
+ coordinate.
+ """
+ box_list = []
+ for box in boxes:
+ # Move box to ego vehicle coord system
+ box.rotate(Quaternion(info['lidar2ego_rotation']))
+ box.translate(np.array(info['lidar2ego_translation']))
+ # Move box to global coord system
+ box.rotate(Quaternion(info['ego2global_rotation']))
+ box.translate(np.array(info['ego2global_translation']))
+ box_list.append(box)
+ return box_list
\ No newline at end of file
diff --git a/mmcv/datasets/map_utils/mean_ap.py b/mmcv/datasets/map_utils/mean_ap.py
new file mode 100644
index 0000000..9b3a49b
--- /dev/null
+++ b/mmcv/datasets/map_utils/mean_ap.py
@@ -0,0 +1,390 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from multiprocessing import Pool
+from shapely.geometry import LineString, Polygon
+import mmcv
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+import json
+from os import path as osp
+import os
+from functools import partial
+from .tpfp import tpfp_gen, custom_tpfp_gen
+from mmcv.fileio.io import dump,load
+
+def average_precision(recalls, precisions, mode='area'):
+ """Calculate average precision (for single or multiple scales).
+
+ Args:
+ recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+ precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+ mode (str): 'area' or '11points', 'area' means calculating the area
+ under precision-recall curve, '11points' means calculating
+ the average precision of recalls at [0, 0.1, ..., 1]
+
+ Returns:
+ float or ndarray: calculated average precision
+ """
+ no_scale = False
+ if recalls.ndim == 1:
+ no_scale = True
+ recalls = recalls[np.newaxis, :]
+ precisions = precisions[np.newaxis, :]
+ assert recalls.shape == precisions.shape and recalls.ndim == 2
+ num_scales = recalls.shape[0]
+ ap = np.zeros(num_scales, dtype=np.float32)
+ if mode == 'area':
+ zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+ ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+ mrec = np.hstack((zeros, recalls, ones))
+ mpre = np.hstack((zeros, precisions, zeros))
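+        # Make precision monotonically non-increasing (the precision envelope)
+        # before integrating the area under the PR curve.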
+ for i in range(mpre.shape[1] - 1, 0, -1):
+ mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+ for i in range(num_scales):
+ ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+ ap[i] = np.sum(
+ (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+ elif mode == '11points':
+ for i in range(num_scales):
+ for thr in np.arange(0, 1 + 1e-3, 0.1):
+ precs = precisions[i, recalls[i, :] >= thr]
+ prec = precs.max() if precs.size > 0 else 0
+ ap[i] += prec
+ ap /= 11
+ else:
+ raise ValueError(
+ 'Unrecognized mode, only "area" and "11points" are supported')
+ if no_scale:
+ ap = ap[0]
+ return ap
+
+def get_cls_results(gen_results,
+ annotations,
+ num_sample=100,
+ num_pred_pts_per_instance=30,
+ eval_use_same_gt_sample_num_flag=False,
+ class_id=0,
+ fix_interval=False):
+ """Get det results and gt information of a certain class.
+
+ Args:
+ gen_results (list[list]): Same as `eval_map()`.
+ annotations (list[dict]): Same as `eval_map()`.
+ class_id (int): ID of a specific class.
+
+ Returns:
+ tuple[list[np.ndarray]]: detected bboxes, gt bboxes
+ """
+ # if len(gen_results) == 0 or
+
+ cls_gens, cls_scores = [], []
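+    # Collect the predicted vectors of this class; when
+    # eval_use_same_gt_sample_num_flag is set they are resampled along their arc
+    # length to num_sample points, and the confidence score is appended as the
+    # last column.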
+ for res in gen_results['vectors']:
+ if res['type'] == class_id:
+ if len(res['pts']) < 2:
+ continue
+ if not eval_use_same_gt_sample_num_flag:
+ sampled_points = np.array(res['pts'])
+ else:
+ line = res['pts']
+ line = LineString(line)
+
+ if fix_interval:
+ distances = list(np.arange(1., line.length, 1.))
+ distances = [0,] + distances + [line.length,]
+ sampled_points = np.array([list(line.interpolate(distance).coords)
+ for distance in distances]).reshape(-1, 2)
+ else:
+ distances = np.linspace(0, line.length, num_sample)
+ sampled_points = np.array([list(line.interpolate(distance).coords)
+ for distance in distances]).reshape(-1, 2)
+
+ cls_gens.append(sampled_points)
+ cls_scores.append(res['confidence_level'])
+ num_res = len(cls_gens)
+ if num_res > 0:
+ cls_gens = np.stack(cls_gens).reshape(num_res,-1)
+ cls_scores = np.array(cls_scores)[:,np.newaxis]
+ cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1)
+ # print(f'for class {i}, cls_gens has shape {cls_gens.shape}')
+ else:
+ if not eval_use_same_gt_sample_num_flag:
+ cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1))
+ else:
+ cls_gens = np.zeros((0,num_sample*2+1))
+ # print(f'for class {i}, cls_gens has shape {cls_gens.shape}')
+
+ cls_gts = []
+ for ann in annotations['vectors']:
+ if ann['type'] == class_id:
+ # line = ann['pts'] + np.array((1,1)) # for hdmapnet
+ line = ann['pts']
+ # line = ann['pts'].cumsum(0)
+ line = LineString(line)
+ distances = np.linspace(0, line.length, num_sample)
+ sampled_points = np.array([list(line.interpolate(distance).coords)
+ for distance in distances]).reshape(-1, 2)
+
+ cls_gts.append(sampled_points)
+ num_gts = len(cls_gts)
+ if num_gts > 0:
+ cls_gts = np.stack(cls_gts).reshape(num_gts,-1)
+ else:
+ cls_gts = np.zeros((0,num_sample*2))
+ return cls_gens, cls_gts
+ # ones = np.ones((num_gts,1))
+ # tmp_cls_gens = np.concatenate([cls_gts,ones],axis=-1)
+ # return tmp_cls_gens, cls_gts
+
+def format_res_gt_by_classes(result_path,
+ gen_results,
+ annotations,
+ cls_names=None,
+ num_pred_pts_per_instance=30,
+ eval_use_same_gt_sample_num_flag=False,
+ pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0],
+ nproc=24):
+ assert cls_names is not None
+ timer = mmcv.Timer()
+ num_fixed_sample_pts = 100
+ fix_interval = False
+ print('results path: {}'.format(result_path))
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+ assert len(gen_results) == len(annotations)
+
+ pool = Pool(nproc)
+ cls_gens, cls_gts = {}, {}
+ print('Formatting ...')
+ formatting_file = 'cls_formatted.pkl'
+ formatting_file = osp.join(output_dir,formatting_file)
+
+ # for vis
+ if False:
+ from PIL import Image
+ import matplotlib.pyplot as plt
+ from matplotlib import transforms
+ from matplotlib.patches import Rectangle
+
+ show_dir = osp.join(output_dir,'vis_json')
+ mmcv.mkdir_or_exist(osp.abspath(show_dir))
+ # import pdb;pdb.set_trace()
+ car_img = Image.open('./figs/lidar_car.png')
+ colors_plt = ['r', 'b', 'g']
+ for i in range(20):
+
+ plt.figure(figsize=(2, 4))
+ plt.xlim(pc_range[0], pc_range[3])
+ plt.ylim(pc_range[1], pc_range[4])
+ plt.axis('off')
+
+ for line in gen_results[i]['vectors']:
+ l = np.array(line['pts'])
+ plt.plot(l[:,0],l[:,1],'-',
+ # color=colors[line['type']]
+ color = 'red',
+ )
+
+ for line in annotations[i]['vectors']:
+ # l = np.array(line['pts']) + np.array((1,1))
+ l = np.array(line['pts'])
+ # l = line['pts']
+ plt.plot(l[:,0],l[:,1],'-',
+ # color=colors[line['type']],
+ color = 'blue',
+ )
+ plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5])
+ map_path = osp.join(show_dir, 'COMPARE_MAP_{}.jpg'.format(i))
+ plt.savefig(map_path, bbox_inches='tight', dpi=400)
+ plt.close()
+
+ for i, clsname in enumerate(cls_names):
+
+ gengts = pool.starmap(
+ partial(get_cls_results, num_sample=num_fixed_sample_pts,
+ num_pred_pts_per_instance=num_pred_pts_per_instance,
+ eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval),
+ zip(list(gen_results.values()), annotations))
+ # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval),
+ # zip(gen_results, annotations))
+ # import pdb;pdb.set_trace()
+ gens, gts = tuple(zip(*gengts))
+ cls_gens[clsname] = gens
+ cls_gts[clsname] = gts
+
+ dump([cls_gens, cls_gts],formatting_file)
+    print('Cls data formatting done in {:.2f}s!! with {}'.format(float(timer.since_start()),formatting_file))
+ pool.close()
+ return cls_gens, cls_gts
+
+def eval_map(gen_results,
+ annotations,
+ cls_gens,
+ cls_gts,
+ threshold=0.5,
+ cls_names=None,
+ logger=None,
+ tpfp_fn=None,
+ pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0],
+ metric=None,
+ num_pred_pts_per_instance=30,
+ nproc=24):
+ timer = mmcv.Timer()
+ pool = Pool(nproc)
+
+ eval_results = []
+
+ for i, clsname in enumerate(cls_names):
+
+ # get gt and det bboxes of this class
+ cls_gen = cls_gens[clsname]
+ cls_gt = cls_gts[clsname]
+ # choose proper function according to datasets to compute tp and fp
+ # XXX
+ # func_name = cls2func[clsname]
+ # tpfp_fn = tpfp_fn_dict[tpfp_fn_name]
+ tpfp_fn = custom_tpfp_gen
+        # Trick for serialization:
+        # only a top-level function can be pickled for multiprocessing, so
+        # functools.partial is used below to bind the extra arguments while the
+        # wrapped function itself stays defined at the top level.
+
+ # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric)
+ # import pdb; pdb.set_trace()
+ # TODO this is a hack
+ tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric)
+ args = []
+ # compute tp and fp for each image with multiple processes
+ tpfp = pool.starmap(
+ tpfp_fn,
+ zip(cls_gen, cls_gt, *args))
+ # import pdb;pdb.set_trace()
+ tp, fp = tuple(zip(*tpfp))
+
+
+
+ # map_results = map(
+ # tpfp_fn,
+ # cls_gen, cls_gt)
+ # tp, fp = tuple(map(list, zip(*map_results)))
+
+
+ # debug and testing
+ # for i in range(len(cls_gen)):
+ # # print(i)
+ # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold)
+ # print(i)
+ # tpfp = (tpfp,)
+ # print(tpfp)
+ # i = 0
+ # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold)
+ # import pdb; pdb.set_trace()
+
+ # XXX
+
+ num_gts = 0
+ for j, bbox in enumerate(cls_gt):
+ num_gts += bbox.shape[0]
+
+ # sort all det bboxes by score, also sort tp and fp
+ # import pdb;pdb.set_trace()
+ cls_gen = np.vstack(cls_gen)
+ num_dets = cls_gen.shape[0]
+        sort_inds = np.argsort(-cls_gen[:, -1])  # descending, highest score first
+ tp = np.hstack(tp)[sort_inds]
+ fp = np.hstack(fp)[sort_inds]
+
+ # calculate recall and precision with tp and fp
+ # num_det*num_res
+ tp = np.cumsum(tp, axis=0)
+ fp = np.cumsum(fp, axis=0)
+ eps = np.finfo(np.float32).eps
+ recalls = tp / np.maximum(num_gts, eps)
+ precisions = tp / np.maximum((tp + fp), eps)
+
+ # calculate AP
+ # if dataset != 'voc07' else '11points'
+ mode = 'area'
+ ap = average_precision(recalls, precisions, mode)
+ eval_results.append({
+ 'num_gts': num_gts,
+ 'num_dets': num_dets,
+ 'recall': recalls,
+ 'precision': precisions,
+ 'ap': ap
+ })
+        print('cls:{} done in {:.2f}s!!'.format(clsname,float(timer.since_last_check())))
+ pool.close()
+ aps = []
+ for cls_result in eval_results:
+ if cls_result['num_gts'] > 0:
+ aps.append(cls_result['ap'])
+ mean_ap = np.array(aps).mean().item() if len(aps) else 0.0
+
+ print_map_summary(
+ mean_ap, eval_results, class_name=cls_names, logger=logger)
+
+ return mean_ap, eval_results
+
+
+
+def print_map_summary(mean_ap,
+ results,
+ class_name=None,
+ scale_ranges=None,
+ logger=None):
+ """Print mAP and results of each class.
+
+ A table will be printed to show the gts/dets/recall/AP of each class and
+ the mAP.
+
+ Args:
+ mean_ap (float): Calculated from `eval_map()`.
+ results (list[dict]): Calculated from `eval_map()`.
+ dataset (list[str] | str | None): Dataset name or dataset classes.
+ scale_ranges (list[tuple] | None): Range of scales to be evaluated.
+ logger (logging.Logger | str | None): The way to print the mAP
+ summary. See `mmcv.utils.print_log()` for details. Default: None.
+ """
+
+ if logger == 'silent':
+ return
+
+ if isinstance(results[0]['ap'], np.ndarray):
+ num_scales = len(results[0]['ap'])
+ else:
+ num_scales = 1
+
+ if scale_ranges is not None:
+ assert len(scale_ranges) == num_scales
+
+ num_classes = len(results)
+
+ recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
+ aps = np.zeros((num_scales, num_classes), dtype=np.float32)
+ num_gts = np.zeros((num_scales, num_classes), dtype=int)
+ for i, cls_result in enumerate(results):
+ if cls_result['recall'].size > 0:
+ recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
+ aps[:, i] = cls_result['ap']
+ num_gts[:, i] = cls_result['num_gts']
+
+ label_names = class_name
+
+ if not isinstance(mean_ap, list):
+ mean_ap = [mean_ap]
+
+ header = ['class', 'gts', 'dets', 'recall', 'ap']
+ for i in range(num_scales):
+ if scale_ranges is not None:
+ print_log(f'Scale range {scale_ranges[i]}', logger=logger)
+ table_data = [header]
+ for j in range(num_classes):
+ row_data = [
+ label_names[j], num_gts[i, j], results[j]['num_dets'],
+ f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}'
+ ]
+ table_data.append(row_data)
+ table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}'])
+ table = AsciiTable(table_data)
+ table.inner_footing_row_border = True
+ print_log('\n' + table.table, logger=logger)
diff --git a/mmcv/datasets/map_utils/struct.py b/mmcv/datasets/map_utils/struct.py
new file mode 100644
index 0000000..1f20fee
--- /dev/null
+++ b/mmcv/datasets/map_utils/struct.py
@@ -0,0 +1,438 @@
+import numpy as np
+import torch
+from shapely.geometry import LineString
+from mmcv.datasets.pipelines import to_tensor
+
+class LiDARInstanceLines(object):
+ """Line instance in LIDAR coordinates
+
+ """
+ def __init__(self,
+ instance_line_list,
+ sample_dist=1,
+ num_samples=250,
+ padding=False,
+ fixed_num=-1,
+ padding_value=-10000,
+ patch_size=None):
+ assert isinstance(instance_line_list, list)
+ assert patch_size is not None
+ if len(instance_line_list) != 0:
+ assert isinstance(instance_line_list[0], LineString)
+ self.patch_size = patch_size
+ self.max_x = self.patch_size[1] / 2
+ self.max_y = self.patch_size[0] / 2
+ self.sample_dist = sample_dist
+ self.num_samples = num_samples
+ self.padding = padding
+ self.fixed_num = fixed_num
+ self.padding_value = padding_value
+
+ self.instance_list = instance_line_list
+
+ @property
+ def start_end_points(self):
+ """
+ return torch.Tensor([N,4]), in xstart, ystart, xend, yend form
+ """
+ assert len(self.instance_list) != 0
+ instance_se_points_list = []
+ for instance in self.instance_list:
+ se_points = []
+ se_points.extend(instance.coords[0])
+ se_points.extend(instance.coords[-1])
+ instance_se_points_list.append(se_points)
+ instance_se_points_array = np.array(instance_se_points_list)
+ instance_se_points_tensor = to_tensor(instance_se_points_array)
+ instance_se_points_tensor = instance_se_points_tensor.to(
+ dtype=torch.float32)
+ instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x)
+ instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y)
+ instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x)
+ instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y)
+ return instance_se_points_tensor
+
+ @property
+ def bbox(self):
+ """
+ return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form
+ """
+ assert len(self.instance_list) != 0
+ instance_bbox_list = []
+ for instance in self.instance_list:
+ # bounds is bbox: [xmin, ymin, xmax, ymax]
+ instance_bbox_list.append(instance.bounds)
+ instance_bbox_array = np.array(instance_bbox_list)
+ instance_bbox_tensor = to_tensor(instance_bbox_array)
+ instance_bbox_tensor = instance_bbox_tensor.to(
+ dtype=torch.float32)
+ instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x)
+ instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y)
+ instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x)
+ instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y)
+ return instance_bbox_tensor
+
+ @property
+ def fixed_num_sampled_points(self):
+ """
+        return torch.Tensor([N,fixed_num,2]) of (x, y) points sampled uniformly along each instance
+ N means the num of instances
+ """
+ assert len(self.instance_list) != 0
+ instance_points_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ instance_points_list.append(sampled_points)
+ instance_points_array = np.array(instance_points_list)
+ instance_points_tensor = to_tensor(instance_points_array)
+ instance_points_tensor = instance_points_tensor.to(
+ dtype=torch.float32)
+ instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ return instance_points_tensor
+
+ @property
+ def fixed_num_sampled_points_ambiguity(self):
+ """
+        return torch.Tensor([N,1,fixed_num,2]) of (x, y) points sampled uniformly along each instance
+ N means the num of instances
+ """
+ assert len(self.instance_list) != 0
+ instance_points_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ instance_points_list.append(sampled_points)
+ instance_points_array = np.array(instance_points_list)
+ instance_points_tensor = to_tensor(instance_points_array)
+ instance_points_tensor = instance_points_tensor.to(
+ dtype=torch.float32)
+ instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ instance_points_tensor = instance_points_tensor.unsqueeze(1)
+ return instance_points_tensor
+
+ @property
+ def fixed_num_sampled_points_torch(self):
+ """
+        return torch.Tensor([N,fixed_num,2]) of (x, y) points interpolated to fixed_num per instance
+ N means the num of instances
+ """
+ assert len(self.instance_list) != 0
+ instance_points_list = []
+ for instance in self.instance_list:
+ # distances = np.linspace(0, instance.length, self.fixed_num)
+ # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ poly_pts = to_tensor(np.array(list(instance.coords)))
+ poly_pts = poly_pts.unsqueeze(0).permute(0,2,1)
+ sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True)
+ sampled_pts = sampled_pts.permute(0,2,1).squeeze(0)
+ instance_points_list.append(sampled_pts)
+ # instance_points_array = np.array(instance_points_list)
+ # instance_points_tensor = to_tensor(instance_points_array)
+ instance_points_tensor = torch.stack(instance_points_list,dim=0)
+ instance_points_tensor = instance_points_tensor.to(
+ dtype=torch.float32)
+ instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ return instance_points_tensor
+
+ @property
+ def shift_fixed_num_sampled_points(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
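+        # For a closed polygon (first point == last point) every cyclic shift of
+        # the point ordering is an equivalent ground truth, so all rolls are kept;
+        # an open polyline only has its forward and reversed orderings, and the
+        # remaining slots are filled with padding_value.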
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ fixed_num = fixed_num_pts.shape[0]
+ shift_pts_list = []
+ if is_poly:
+ # import pdb;pdb.set_trace()
+ for shift_right_i in range(fixed_num):
+ shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v1(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ pts_num = fixed_num_pts.shape[0]
+ shift_num = pts_num - 1
+ if is_poly:
+ pts_to_shift = fixed_num_pts[:-1,:]
+ shift_pts_list = []
+ if is_poly:
+ for shift_right_i in range(shift_num):
+ shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ if is_poly:
+ _, _, num_coords = shift_pts.shape
+ tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords))
+ tmp_shift_pts[:,:-1,:] = shift_pts
+ tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+ shift_pts = tmp_shift_pts
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v2(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ assert len(self.instance_list) != 0
+ instances_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ poly_pts = np.array(list(instance.coords))
+ start_pts = poly_pts[0]
+ end_pts = poly_pts[-1]
+ is_poly = np.equal(start_pts, end_pts)
+ is_poly = is_poly.all()
+ shift_pts_list = []
+ pts_num, coords_num = poly_pts.shape
+ shift_num = pts_num - 1
+ final_shift_num = self.fixed_num - 1
+ if is_poly:
+ pts_to_shift = poly_pts[:-1,:]
+ for shift_right_i in range(shift_num):
+ shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+ pts_to_concat = shift_pts[0]
+ pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+ shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+ shift_instance = LineString(shift_pts)
+ shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ shift_pts_list.append(shift_sampled_points)
+ # import pdb;pdb.set_trace()
+ else:
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ flip_sampled_points = np.flip(sampled_points, axis=0)
+ shift_pts_list.append(sampled_points)
+ shift_pts_list.append(flip_sampled_points)
+
+ multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+ shifts_num,_,_ = multi_shifts_pts.shape
+
+ if shifts_num > final_shift_num:
+ index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False)
+ multi_shifts_pts = multi_shifts_pts[index]
+
+ multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+ multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+ dtype=torch.float32)
+
+ multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ # if not is_poly:
+ if multi_shifts_pts_tensor.shape[0] < final_shift_num:
+ padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+ multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+ instances_list.append(multi_shifts_pts_tensor)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v3(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ assert len(self.instance_list) != 0
+ instances_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ poly_pts = np.array(list(instance.coords))
+ start_pts = poly_pts[0]
+ end_pts = poly_pts[-1]
+ is_poly = np.equal(start_pts, end_pts)
+ is_poly = is_poly.all()
+ shift_pts_list = []
+ pts_num, coords_num = poly_pts.shape
+ shift_num = pts_num - 1
+ final_shift_num = self.fixed_num - 1
+ if is_poly:
+ pts_to_shift = poly_pts[:-1,:]
+ for shift_right_i in range(shift_num):
+ shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+ pts_to_concat = shift_pts[0]
+ pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+ shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+ shift_instance = LineString(shift_pts)
+ shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ shift_pts_list.append(shift_sampled_points)
+ flip_pts_to_shift = np.flip(pts_to_shift, axis=0)
+ for shift_right_i in range(shift_num):
+ shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0)
+ pts_to_concat = shift_pts[0]
+ pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+ shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+ shift_instance = LineString(shift_pts)
+ shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ shift_pts_list.append(shift_sampled_points)
+ # import pdb;pdb.set_trace()
+ else:
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ flip_sampled_points = np.flip(sampled_points, axis=0)
+ shift_pts_list.append(sampled_points)
+ shift_pts_list.append(flip_sampled_points)
+
+ multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+ shifts_num,_,_ = multi_shifts_pts.shape
+ # import pdb;pdb.set_trace()
+ if shifts_num > 2*final_shift_num:
+ index = np.random.choice(shift_num, final_shift_num, replace=False)
+ flip0_shifts_pts = multi_shifts_pts[index]
+ flip1_shifts_pts = multi_shifts_pts[index+shift_num]
+ multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0)
+
+ multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+ multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+ dtype=torch.float32)
+
+ multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ # if not is_poly:
+ if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num:
+ padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+ multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+ instances_list.append(multi_shifts_pts_tensor)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v4(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ pts_num = fixed_num_pts.shape[0]
+ shift_num = pts_num - 1
+ shift_pts_list = []
+ if is_poly:
+ pts_to_shift = fixed_num_pts[:-1,:]
+ for shift_right_i in range(shift_num):
+ shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+ flip_pts_to_shift = pts_to_shift.flip(0)
+ for shift_right_i in range(shift_num):
+ shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ if is_poly:
+ _, _, num_coords = shift_pts.shape
+ tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords))
+ tmp_shift_pts[:,:-1,:] = shift_pts
+ tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+ shift_pts = tmp_shift_pts
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_torch(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points_torch
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ fixed_num = fixed_num_pts.shape[0]
+ shift_pts_list = []
+ if is_poly:
+ # import pdb;pdb.set_trace()
+ for shift_right_i in range(fixed_num):
+ shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
\ No newline at end of file
diff --git a/mmcv/datasets/map_utils/tpfp.py b/mmcv/datasets/map_utils/tpfp.py
new file mode 100644
index 0000000..a40ea1d
--- /dev/null
+++ b/mmcv/datasets/map_utils/tpfp.py
@@ -0,0 +1,363 @@
+import mmcv
+import numpy as np
+
+from mmcv.core.evaluation.bbox_overlaps import bbox_overlaps
+from .tpfp_chamfer import vec_iou, convex_iou, rbbox_iou, polyline_score, custom_polyline_score
+from shapely.geometry import LineString, Polygon
+# from vecmapnet_ops.ops.iou import convex_iou
+
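+# tpfp_bbox, tpfp_rbbox, tpfp_det, tpfp_gen and custom_tpfp_gen below all
+# share the same greedy matching scheme: detections are visited in descending
+# score order, and a detection is a true positive if its best-overlapping GT
+# clears `threshold` and has not been claimed yet; everything else is a false
+# positive.  A tiny sketch with illustrative numbers (not from any dataset):
+#     overlap = [[0.7, 0.1],    # det 0 vs gt 0 / gt 1
+#                [0.6, 0.2]]    # det 1 vs gt 0 / gt 1
+#     scores = [0.9, 0.8], threshold = 0.5
+#     -> det 0 claims gt 0 (tp); det 1 also prefers gt 0, already taken (fp).
+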
+def tpfp_bbox(det_bboxes,
+ gt_bboxes,
+ gt_bbox_masks,
+ threshold=0.5):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+ det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ use_legacy_coordinate (bool): Whether to use coordinate system in
+ mmdet v1.x. which means width, height should be
+ calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+ Default: False.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+
+ num_dets = len(det_bboxes)
+ num_gts = len(gt_bboxes)
+
+ # tp and fp
+ tp = np.zeros((num_dets), dtype=np.float32)
+ fp = np.zeros((num_dets), dtype=np.float32)
+
+ # if there is no gt bboxes in this image, then all det bboxes
+ # within area range are false positives
+ # XXX
+ if num_gts == 0:
+ fp[...] = 1
+ return tp, fp
+
+ if num_dets == 0:
+ return tp, fp
+
+ # # distance matrix: n x m
+ bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2)
+ bbox_g = gt_bboxes.reshape(num_gts,-1,2)
+ bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2)
+ matrix = convex_iou(bbox_p,bbox_g,bbox_gm)
+
+ # for each det, the max iou with all gts
+ matrix_max = matrix.max(axis=1)
+ # for each det, which gt overlaps most with it
+ matrix_argmax = matrix.argmax(axis=1)
+ # sort all dets in descending order by scores
+ sort_inds = np.argsort(-det_bboxes[:, -1])
+
+ gt_covered = np.zeros(num_gts, dtype=bool)
+
+ # tp = 0 and fp = 0 means ignore this detected bbox,
+ for i in sort_inds:
+ if matrix_max[i] >= threshold:
+ matched_gt = matrix_argmax[i]
+ if not gt_covered[matched_gt]:
+ gt_covered[matched_gt] = True
+ tp[i] = 1
+ else:
+ fp[i] = 1
+ else:
+ fp[i] = 1
+
+ return tp, fp
+
+
+def tpfp_rbbox(det_bboxes,
+ gt_bboxes,
+ gt_bbox_masks,
+ threshold=0.5):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+ det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ use_legacy_coordinate (bool): Whether to use coordinate system in
+ mmdet v1.x. which means width, height should be
+ calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+ Default: False.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+
+ num_dets = len(det_bboxes)
+ num_gts = len(gt_bboxes)
+
+ # tp and fp
+ tp = np.zeros((num_dets), dtype=np.float32)
+ fp = np.zeros((num_dets), dtype=np.float32)
+
+ # if there is no gt bboxes in this image, then all det bboxes
+ # within area range are false positives
+ # XXX
+ if num_gts == 0:
+ fp[...] = 1
+ return tp, fp
+
+ if num_dets == 0:
+ return tp, fp
+
+ # # distance matrix: n x m
+ bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2)
+ bbox_g = gt_bboxes.reshape(num_gts,-1,2)
+ bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2)
+ matrix = rbbox_iou(bbox_p,bbox_g,bbox_gm)
+
+ # for each det, the max iou with all gts
+ matrix_max = matrix.max(axis=1)
+ # for each det, which gt overlaps most with it
+ matrix_argmax = matrix.argmax(axis=1)
+ # sort all dets in descending order by scores
+ sort_inds = np.argsort(-det_bboxes[:, -1])
+
+ gt_covered = np.zeros(num_gts, dtype=bool)
+
+ # tp = 0 and fp = 0 means ignore this detected bbox,
+ for i in sort_inds:
+ if matrix_max[i] >= threshold:
+ matched_gt = matrix_argmax[i]
+ if not gt_covered[matched_gt]:
+ gt_covered[matched_gt] = True
+ tp[i] = 1
+ else:
+ fp[i] = 1
+ else:
+ fp[i] = 1
+
+ return tp, fp
+
+
+def tpfp_det(det_bboxes,
+ gt_bboxes,
+ threshold=0.5):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+ det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ use_legacy_coordinate (bool): Whether to use coordinate system in
+ mmdet v1.x. which means width, height should be
+ calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+ Default: False.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+
+ num_dets = det_bboxes.shape[0]
+ num_gts = gt_bboxes.shape[0]
+
+ # tp and fp
+ tp = np.zeros((num_dets), dtype=np.float32)
+ fp = np.zeros((num_dets), dtype=np.float32)
+
+ # if there is no gt bboxes in this image, then all det bboxes
+ # within area range are false positives
+ # XXX
+ if num_gts == 0:
+ fp[...] = 1
+ return tp, fp
+
+ if num_dets == 0:
+ return tp, fp
+
+ # # distance matrix: n x m
+ matrix = vec_iou(
+ det_bboxes[:, :-1].reshape(num_dets,-1,2),
+ gt_bboxes.reshape(num_gts,-1,2))
+ # for each det, the max iou with all gts
+ matrix_max = matrix.max(axis=1)
+ # for each det, which gt overlaps most with it
+ matrix_argmax = matrix.argmax(axis=1)
+ # sort all dets in descending order by scores
+ sort_inds = np.argsort(-det_bboxes[:, -1])
+
+ gt_covered = np.zeros(num_gts, dtype=bool)
+
+ # tp = 0 and fp = 0 means ignore this detected bbox,
+ for i in sort_inds:
+ if matrix_max[i] >= threshold:
+ matched_gt = matrix_argmax[i]
+ if not gt_covered[matched_gt]:
+ gt_covered[matched_gt] = True
+ tp[i] = 1
+ else:
+ fp[i] = 1
+ else:
+ fp[i] = 1
+
+ return tp, fp
+
+
+def tpfp_gen(gen_lines,
+ gt_lines,
+ threshold=0.5,
+ metric='POR'):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+ det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ use_legacy_coordinate (bool): Whether to use coordinate system in
+ mmdet v1.x. which means width, height should be
+ calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+ Default: False.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+
+ num_gens = gen_lines.shape[0]
+ num_gts = gt_lines.shape[0]
+
+ # tp and fp
+ tp = np.zeros((num_gens), dtype=np.float32)
+ fp = np.zeros((num_gens), dtype=np.float32)
+
+ # if there is no gt bboxes in this image, then all det bboxes
+ # within area range are false positives
+ if num_gts == 0:
+ fp[...] = 1
+ return tp, fp
+
+ if num_gens == 0:
+ return tp, fp
+
+ gen_scores = gen_lines[:,-1] # n
+ # distance matrix: n x m
+
+ # matrix = custom_polyline_score(
+ # gen_lines[:,:-1].reshape(num_gens,-1,2),
+ # gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric)
+
+    # TODO: possible bug: polyline_score is used here instead of the
+    # custom_polyline_score call commented out above.
+ matrix = polyline_score(
+ gen_lines[:,:-1].reshape(num_gens,-1,2),
+ gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric)
+ # for each det, the max iou with all gts
+ matrix_max = matrix.max(axis=1)
+ # for each det, which gt overlaps most with it
+ matrix_argmax = matrix.argmax(axis=1)
+ # sort all dets in descending order by scores
+ sort_inds = np.argsort(-gen_scores)
+
+ gt_covered = np.zeros(num_gts, dtype=bool)
+
+ # tp = 0 and fp = 0 means ignore this detected bbox,
+ for i in sort_inds:
+ if matrix_max[i] >= threshold:
+ matched_gt = matrix_argmax[i]
+ if not gt_covered[matched_gt]:
+ gt_covered[matched_gt] = True
+ tp[i] = 1
+ else:
+ fp[i] = 1
+ else:
+ fp[i] = 1
+
+ return tp, fp
+
+
+def custom_tpfp_gen(gen_lines,
+ gt_lines,
+ threshold=0.5,
+ metric='chamfer'):
+ """Check if detected bboxes are true positive or false positive.
+
+ Args:
+ det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+ gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+ of shape (k, 4). Default: None
+ iou_thr (float): IoU threshold to be considered as matched.
+ Default: 0.5.
+ use_legacy_coordinate (bool): Whether to use coordinate system in
+ mmdet v1.x. which means width, height should be
+ calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+ Default: False.
+
+ Returns:
+ tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+ each array is (num_scales, m).
+ """
+ if metric == 'chamfer':
+ if threshold >0:
+ threshold= -threshold
+ # else:
+ # raise NotImplementedError
+
+ # import pdb;pdb.set_trace()
+ num_gens = gen_lines.shape[0]
+ num_gts = gt_lines.shape[0]
+
+ # tp and fp
+ tp = np.zeros((num_gens), dtype=np.float32)
+ fp = np.zeros((num_gens), dtype=np.float32)
+
+ # if there is no gt bboxes in this image, then all det bboxes
+ # within area range are false positives
+ if num_gts == 0:
+ fp[...] = 1
+ return tp, fp
+
+ if num_gens == 0:
+ return tp, fp
+
+ gen_scores = gen_lines[:,-1] # n
+ # distance matrix: n x m
+
+ matrix = custom_polyline_score(
+ gen_lines[:,:-1].reshape(num_gens,-1,2),
+ gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric)
+ # for each det, the max iou with all gts
+ matrix_max = matrix.max(axis=1)
+ # for each det, which gt overlaps most with it
+ matrix_argmax = matrix.argmax(axis=1)
+ # sort all dets in descending order by scores
+ sort_inds = np.argsort(-gen_scores)
+
+ gt_covered = np.zeros(num_gts, dtype=bool)
+
+ # tp = 0 and fp = 0 means ignore this detected bbox,
+ for i in sort_inds:
+ if matrix_max[i] >= threshold:
+ matched_gt = matrix_argmax[i]
+ if not gt_covered[matched_gt]:
+ gt_covered[matched_gt] = True
+ tp[i] = 1
+ else:
+ fp[i] = 1
+ else:
+ fp[i] = 1
+
+ return tp, fp
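+# Note: with metric='chamfer', custom_polyline_score returns the *negated*
+# mean chamfer distance (and -100. for pairs whose buffers never intersect),
+# which is why a positive threshold is negated above.  For example, a chamfer
+# threshold of 1.0 becomes -1.0, and "matrix_max[i] >= threshold" is then
+# equivalent to "mean chamfer distance <= 1.0 m".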
+
diff --git a/mmcv/datasets/map_utils/tpfp_chamfer.py b/mmcv/datasets/map_utils/tpfp_chamfer.py
new file mode 100644
index 0000000..db55fdd
--- /dev/null
+++ b/mmcv/datasets/map_utils/tpfp_chamfer.py
@@ -0,0 +1,335 @@
+# from ..chamfer_dist import ChamferDistance
+import numpy as np
+from shapely.geometry import LineString, Polygon
+from shapely.strtree import STRtree
+from shapely.geometry import CAP_STYLE, JOIN_STYLE
+from scipy.spatial import distance
+import similaritymeasures
+
+# def chamfer_distance(pred_bbox, gt_bbox):
+
+# cd_dist_func = ChamferDistance.vec_cd_dist(
+# pred, pred_mask, tgt, tgt_mask)()
+
+
+def vec_iou(pred_lines, gt_lines):
+    '''
+    IoU between polylines, each dilated by a 1 m buffer (round caps/joins).
+    pred_lines: (num_preds, npts, 2)
+    gt_lines: (num_gts, npts, 2)
+    Returns an IoU matrix of shape (num_preds, num_gts).
+    '''
+
+ num_preds = pred_lines.shape[0]
+ num_gts = gt_lines.shape[0]
+
+ pred_lines_shapely = \
+ [LineString(i).buffer(1.,
+ cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round)
+ for i in pred_lines]
+ gt_lines_shapely =\
+ [LineString(i).buffer(1.,
+ cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round)
+ for i in gt_lines]
+
+ # construct tree
+ tree = STRtree(gt_lines_shapely)
+ index_by_id = dict((id(pt), i) for i, pt in enumerate(gt_lines_shapely))
+
+ iou_matrix = np.zeros((num_preds, num_gts))
+
+ for i, pline in enumerate(pred_lines_shapely):
+
+ for o in tree.query(pline):
+ if o.intersects(pline):
+ gt_id = index_by_id[id(o)]
+
+ inter = o.intersection(pline).area
+ union = o.union(pline).area
+ iou_matrix[i, gt_id] = inter / union
+
+ return iou_matrix
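+# Note: the STRtree + id() reverse-lookup pattern used throughout this file
+# assumes Shapely 1.x, where STRtree.query() returns geometry objects.  Under
+# Shapely 2.x, query() returns integer indices into the input sequence, so
+# the id()-based dictionaries would have to be replaced by those indices.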
+
+def convex_iou(pred_lines, gt_lines, gt_mask):
+    '''
+    IoU between the convex hulls of predicted and GT elements.
+    pred_lines: (num_preds, npts, 2) or list of (npts, 2) arrays
+    gt_lines: (num_gts, npts, 2)
+    gt_mask: (num_gts, npts, 2) boolean mask selecting valid GT vertices
+    Returns an IoU matrix of shape (num_preds, num_gts).
+    '''
+
+ num_preds = len(pred_lines)
+ num_gts = len(gt_lines)
+
+ pred_lines_shapely = \
+ [Polygon(i).convex_hull for i in pred_lines]
+ gt_lines_shapely =\
+ [Polygon(i[m].reshape(-1,2)).convex_hull for i,m in zip(gt_lines,gt_mask)]
+
+ # construct tree
+ tree = STRtree(pred_lines_shapely)
+ index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely))
+
+ iou_matrix = np.zeros((num_preds, num_gts))
+
+ for i, pline in enumerate(gt_lines_shapely):
+
+ for o in tree.query(pline):
+ if o.intersects(pline):
+ pred_id = index_by_id[id(o)]
+
+ inter = o.intersection(pline).area
+ union = o.union(pline).area
+ iou_matrix[pred_id, i] = inter / union
+
+ return iou_matrix
+
+def rbbox_iou(pred_lines, gt_lines, gt_mask):
+    '''
+    IoU between the minimum rotated rectangle of each prediction and the
+    polygon formed by the valid GT vertices.
+    pred_lines: (num_preds, npts, 2) or list of (npts, 2) arrays
+    gt_lines: (num_gts, npts, 2)
+    gt_mask: (num_gts, npts, 2) boolean mask selecting valid GT vertices
+    Returns an IoU matrix of shape (num_preds, num_gts).
+    '''
+
+ num_preds = len(pred_lines)
+ num_gts = len(gt_lines)
+
+ pred_lines_shapely = \
+ [Polygon(i).minimum_rotated_rectangle for i in pred_lines]
+ gt_lines_shapely =\
+ [Polygon(i[m].reshape(-1,2)) for i,m in zip(gt_lines,gt_mask)]
+
+ # construct tree
+ tree = STRtree(pred_lines_shapely)
+ index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely))
+
+ iou_matrix = np.zeros((num_preds, num_gts))
+
+ for i, pline in enumerate(gt_lines_shapely):
+
+ for o in tree.query(pline):
+ if o.intersects(pline):
+ pred_id = index_by_id[id(o)]
+
+ inter = o.intersection(pline).area
+ union = o.union(pline).area
+ iou_matrix[pred_id, i] = inter / union
+
+ return iou_matrix
+
+
+def polyline_score(pred_lines, gt_lines, linewidth=1., metric='POR'):
+    '''
+    Score predicted polylines against GT polylines.
+    pred_lines: (num_preds, npts, 2)
+    gt_lines: (num_gts, npts, 2)
+    linewidth: buffer distance used (via an STRtree) to pre-select which
+        pred/GT pairs are compared at all
+    metric: 'POR' | 'frechet' | 'chamfer' | 'chamfer_v2'
+    Returns a score matrix of shape (num_preds, num_gts); see the note after
+    this function for the per-metric score conventions.
+    '''
+ positive_threshold = 1.
+ num_preds = len(pred_lines)
+ num_gts = len(gt_lines)
+ line_length = pred_lines.shape[1]
+
+ # gt_lines = gt_lines + np.array((1.,1.))
+
+ pred_lines_shapely = \
+ [LineString(i).buffer(linewidth,
+ cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+ for i in pred_lines]
+ gt_lines_shapely =\
+ [LineString(i).buffer(linewidth,
+ cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+ for i in gt_lines]
+
+ # construct tree
+ tree = STRtree(pred_lines_shapely)
+ index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely))
+
+ if metric=='POR':
+ iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64)
+ elif metric=='frechet':
+ iou_matrix = np.full((num_preds, num_gts), -100.)
+ elif metric=='chamfer':
+ iou_matrix = np.full((num_preds, num_gts), -100.)
+ elif metric=='chamfer_v2':
+ iou_matrix = np.full((num_preds, num_gts), -100.)
+
+ for i, pline in enumerate(gt_lines_shapely):
+
+ for o in tree.query(pline):
+ if o.intersects(pline):
+ pred_id = index_by_id[id(o)]
+
+ if metric=='POR':
+ dist_mat = distance.cdist(
+ pred_lines[pred_id], gt_lines[i], 'euclidean')
+
+ valid_ab = (dist_mat.min(-1) < positive_threshold).sum()
+ valid_ba = (dist_mat.min(-2) < positive_threshold).sum()
+
+ iou_matrix[pred_id, i] = min(valid_ba,valid_ab) / line_length
+ # iou_matrix[pred_id, i] = ((valid_ba+valid_ab)/2) / line_length
+ # assert iou_matrix[pred_id, i] <= 1. and iou_matrix[pred_id, i] >= 0.
+ elif metric=='frechet':
+ fdistance_1 = \
+ -similaritymeasures.frechet_dist(pred_lines[pred_id], gt_lines[i])
+ fdistance_2 = \
+ -similaritymeasures.frechet_dist(pred_lines[pred_id][::-1], gt_lines[i])
+ fdistance = max(fdistance_1,fdistance_2)
+ iou_matrix[pred_id, i] = fdistance
+
+ elif metric=='chamfer':
+ dist_mat = distance.cdist(
+ pred_lines[pred_id], gt_lines[i], 'euclidean')
+
+ valid_ab = dist_mat.min(-1).sum()
+ valid_ba = dist_mat.min(-2).sum()
+
+ iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/(2*line_length)
+ # if iou_matrix[pred_id, i] == 0:
+ # import ipdb; ipdb.set_trace()
+ elif metric=='chamfer_v2':
+ dist_mat = distance.cdist(
+ pred_lines[pred_id], gt_lines[i], 'euclidean')
+
+ valid_ab = dist_mat.min(-1).sum()
+ valid_ba = dist_mat.min(-2).sum()
+
+ iou_matrix[pred_id, i] = -(valid_ba/pred_lines[pred_id].shape[0]
+ +valid_ab/gt_lines[i].shape[0])/2
+ # if iou_matrix[pred_id, i] == 0:
+ # import ipdb; ipdb.set_trace()
+
+
+ # if True:
+ # import matplotlib.pyplot as plt
+ # print('pred num', num_preds)
+ # print('gt num', num_gts)
+ # for i in range(num_preds):
+ # plt.plot(pred_lines[i][:,0],pred_lines[i][:,1],'-',color='red',alpha=0.5)
+ # for i in range(num_gts):
+ # plt.plot(gt_lines[i][:,0],gt_lines[i][:,1],'-',color='blue',alpha=0.5)
+ # plt.savefig('test.png')
+ # plt.close()
+ return iou_matrix
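+# Score conventions used by polyline_score: 'POR' yields a ratio in [0, 1]
+# (fraction of points matched within `positive_threshold`), while 'frechet',
+# 'chamfer' and 'chamfer_v2' yield *negated* distances so that "larger is
+# better" holds for every metric.  Pairs whose buffered geometries never
+# intersect keep their initial value (-100. for the distance metrics, 0. for
+# 'POR') and are therefore never matched.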
+
+
+def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'):
+    '''
+    Score predicted polylines against GT polylines.
+    pred_lines: (num_preds, npts, 2)
+    gt_lines: (num_gts, npts, 2)
+    linewidth: buffer distance used (via an STRtree) to pre-select which
+        pred/GT pairs are compared at all (forced to 1.0 for metric='iou')
+    metric: 'chamfer' (negated mean chamfer distance) or 'iou' (IoU of the
+        buffered lines)
+    Returns a score matrix of shape (num_preds, num_gts).
+    '''
+ if metric == 'iou':
+ linewidth = 1.0
+ positive_threshold = 1.
+ num_preds = len(pred_lines)
+ num_gts = len(gt_lines)
+ line_length = pred_lines.shape[1]
+
+ # gt_lines = gt_lines + np.array((1.,1.))
+
+ pred_lines_shapely = \
+ [LineString(i).buffer(linewidth,
+ cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+ for i in pred_lines]
+ gt_lines_shapely =\
+ [LineString(i).buffer(linewidth,
+ cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+ for i in gt_lines]
+
+ # construct tree
+ tree = STRtree(pred_lines_shapely)
+ index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely))
+
+
+ if metric=='chamfer':
+ iou_matrix = np.full((num_preds, num_gts), -100.)
+ elif metric=='iou':
+ iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64)
+ else:
+ raise NotImplementedError
+
+ for i, pline in enumerate(gt_lines_shapely):
+
+ for o in tree.query(pline):
+ if o.intersects(pline):
+ pred_id = index_by_id[id(o)]
+
+ if metric=='chamfer':
+ dist_mat = distance.cdist(
+ pred_lines[pred_id], gt_lines[i], 'euclidean')
+ # import pdb;pdb.set_trace()
+ valid_ab = dist_mat.min(-1).mean()
+ valid_ba = dist_mat.min(-2).mean()
+
+ iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/2
+ elif metric=='iou':
+ inter = o.intersection(pline).area
+ union = o.union(pline).area
+ iou_matrix[pred_id, i] = inter / union
+
+ return iou_matrix
+
+if __name__ == '__main__':
+ import torch
+
+ line1 = torch.tensor([
+ [1, 5], [3, 5], [5, 5]
+ ])
+
+ line0 = torch.tensor([
+ [3, 6], [4, 8], [5, 6]
+ ])
+
+ line2 = torch.tensor([
+ [1, 4], [3, 4], [5, 4]
+ ])
+
+ line3 = torch.tensor([
+ [4, 4], [3, 3], [5, 3]
+ ])
+
+ gt = torch.stack((line2, line3), dim=0).type(torch.float32)
+ pred = torch.stack((line0, line1), dim=0).type(torch.float32)
+
+ # import ipdb; ipdb.set_trace()
+ import mmcv
+ # with mmcv.Timer():
+ # gt = upsampler(gt, pts=10)
+ # pred = upsampler(pred, pts=10)
+
+ import matplotlib.pyplot as plt
+ from shapely.geometry import LineString
+ from descartes import PolygonPatch
+
+ iou_matrix = vec_iou(pred,gt)
+ print(iou_matrix)
+ # import pdb;pdb.set_trace()
+ score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer')
+ print(score_matrix)
+ fig, ax = plt.subplots()
+ for i in gt:
+ i = i.numpy()
+ plt.plot(i[:, 0], i[:, 1], 'o', color='red')
+ plt.plot(i[:, 0], i[:, 1], '-', color='red')
+
+ dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round)
+ patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1)
+ ax.add_patch(patch1)
+
+ for i in pred:
+ i = i.numpy()
+ plt.plot(i[:, 0], i[:, 1], 'o', color='blue')
+ plt.plot(i[:, 0], i[:, 1], '-', color='blue')
+
+ dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+ patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1)
+ ax.add_patch(patch1)
+
+
+ ax.axis('equal')
+
+
+ plt.savefig('test3.png')
\ No newline at end of file
diff --git a/mmcv/datasets/nuscenes_dataset.py b/mmcv/datasets/nuscenes_dataset.py
new file mode 100644
index 0000000..e9c76e0
--- /dev/null
+++ b/mmcv/datasets/nuscenes_dataset.py
@@ -0,0 +1,658 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import pyquaternion
+import tempfile
+from nuscenes.utils.data_classes import Box as NuScenesBox
+from os import path as osp
+
+from mmcv.datasets import DATASETS
+from mmcv.fileio.io import load, dump
+from mmcv.utils import track_iter_progress, mkdir_or_exist
+from mmcv.core import show_result
+from mmcv.core.bbox.structures.box_3d_mode import Box3DMode
+from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from .custom_3d import Custom3DDataset
+from .pipelines import Compose
+
+
+@DATASETS.register_module()
+class NuScenesDataset(Custom3DDataset):
+ r"""NuScenes Dataset.
+
+ This class serves as the API for experiments on the NuScenes Dataset.
+
+    Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
+ for data downloading.
+
+ Args:
+ ann_file (str): Path of annotation file.
+ pipeline (list[dict], optional): Pipeline used for data processing.
+ Defaults to None.
+ data_root (str): Path of dataset root.
+ classes (tuple[str], optional): Classes used in the dataset.
+ Defaults to None.
+ load_interval (int, optional): Interval of loading the dataset. It is
+ used to uniformly sample the dataset. Defaults to 1.
+ with_velocity (bool, optional): Whether include velocity prediction
+ into the experiments. Defaults to True.
+ modality (dict, optional): Modality to specify the sensor data used
+ as input. Defaults to None.
+ box_type_3d (str, optional): Type of 3D box of this dataset.
+ Based on the `box_type_3d`, the dataset will encapsulate the box
+            in its original format and then convert it to `box_type_3d`.
+            Defaults to 'LiDAR' in this dataset. Available options include:
+ - 'LiDAR': Box in LiDAR coordinates.
+ - 'Depth': Box in depth coordinates, usually for indoor dataset.
+ - 'Camera': Box in camera coordinates.
+ filter_empty_gt (bool, optional): Whether to filter empty GT.
+ Defaults to True.
+ test_mode (bool, optional): Whether the dataset is in test mode.
+ Defaults to False.
+        eval_version (str, optional): Configuration version of evaluation.
+ Defaults to 'detection_cvpr_2019'.
+ use_valid_flag (bool): Whether to use `use_valid_flag` key in the info
+ file as mask to filter gt_boxes and gt_names. Defaults to False.
+ """
+ NameMapping = {
+ 'movable_object.barrier': 'barrier',
+ 'vehicle.bicycle': 'bicycle',
+ 'vehicle.bus.bendy': 'bus',
+ 'vehicle.bus.rigid': 'bus',
+ 'vehicle.car': 'car',
+ 'vehicle.construction': 'construction_vehicle',
+ 'vehicle.motorcycle': 'motorcycle',
+ 'human.pedestrian.adult': 'pedestrian',
+ 'human.pedestrian.child': 'pedestrian',
+ 'human.pedestrian.construction_worker': 'pedestrian',
+ 'human.pedestrian.police_officer': 'pedestrian',
+ 'movable_object.trafficcone': 'traffic_cone',
+ 'vehicle.trailer': 'trailer',
+ 'vehicle.truck': 'truck'
+ }
+ DefaultAttribute = {
+ 'car': 'vehicle.parked',
+ 'pedestrian': 'pedestrian.moving',
+ 'trailer': 'vehicle.parked',
+ 'truck': 'vehicle.parked',
+ 'bus': 'vehicle.moving',
+ 'motorcycle': 'cycle.without_rider',
+ 'construction_vehicle': 'vehicle.parked',
+ 'bicycle': 'cycle.without_rider',
+ 'barrier': '',
+ 'traffic_cone': '',
+ }
+ AttrMapping = {
+ 'cycle.with_rider': 0,
+ 'cycle.without_rider': 1,
+ 'pedestrian.moving': 2,
+ 'pedestrian.standing': 3,
+ 'pedestrian.sitting_lying_down': 4,
+ 'vehicle.moving': 5,
+ 'vehicle.parked': 6,
+ 'vehicle.stopped': 7,
+ }
+ AttrMapping_rev = [
+ 'cycle.with_rider',
+ 'cycle.without_rider',
+ 'pedestrian.moving',
+ 'pedestrian.standing',
+ 'pedestrian.sitting_lying_down',
+ 'vehicle.moving',
+ 'vehicle.parked',
+ 'vehicle.stopped',
+ ]
+ # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa
+ ErrNameMapping = {
+ 'trans_err': 'mATE',
+ 'scale_err': 'mASE',
+ 'orient_err': 'mAOE',
+ 'vel_err': 'mAVE',
+ 'attr_err': 'mAAE'
+ }
+ CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+ 'barrier')
+
+ def __init__(self,
+ ann_file,
+ pipeline=None,
+ data_root=None,
+ classes=None,
+ load_interval=1,
+ with_velocity=True,
+ modality=None,
+ box_type_3d='LiDAR',
+ filter_empty_gt=True,
+ test_mode=False,
+ eval_version='detection_cvpr_2019',
+ use_valid_flag=False):
+ self.load_interval = load_interval
+ self.use_valid_flag = use_valid_flag
+ super().__init__(
+ data_root=data_root,
+ ann_file=ann_file,
+ pipeline=pipeline,
+ classes=classes,
+ modality=modality,
+ box_type_3d=box_type_3d,
+ filter_empty_gt=filter_empty_gt,
+ test_mode=test_mode)
+
+ self.with_velocity = with_velocity
+ self.eval_version = eval_version
+ from nuscenes.eval.detection.config import config_factory
+ self.eval_detection_configs = config_factory(self.eval_version)
+ # self.eval_detection_configs.class_names = list(self.eval_detection_configs.class_names)
+ if self.modality is None:
+ self.modality = dict(
+ use_camera=False,
+ use_lidar=True,
+ use_radar=False,
+ use_map=False,
+ use_external=False,
+ )
+
+ def get_cat_ids(self, idx):
+ """Get category distribution of single scene.
+
+ Args:
+ idx (int): Index of the data_info.
+
+ Returns:
+ dict[list]: for each category, if the current scene
+ contains such boxes, store a list containing idx,
+ otherwise, store empty list.
+ """
+ info = self.data_infos[idx]
+ if self.use_valid_flag:
+ mask = info['valid_flag']
+ gt_names = set(info['gt_names'][mask])
+ else:
+ gt_names = set(info['gt_names'])
+
+ cat_ids = []
+ for name in gt_names:
+ if name in self.CLASSES:
+ cat_ids.append(self.cat2id[name])
+ return cat_ids
+
+ def load_annotations(self, ann_file):
+ """Load annotations from ann_file.
+
+ Args:
+ ann_file (str): Path of the annotation file.
+
+ Returns:
+ list[dict]: List of annotations sorted by timestamps.
+ """
+ data = load(ann_file)
+ data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp']))
+ data_infos = data_infos[::self.load_interval]
+ self.metadata = data['metadata']
+ self.version = self.metadata['version']
+ return data_infos
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+        # standard protocol modified from SECOND.Pytorch
+ input_dict = dict(
+ sample_idx=info['token'],
+ pts_filename=info['lidar_path'],
+ sweeps=info['sweeps'],
+ timestamp=info['timestamp'] / 1e6,
+ )
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ for cam_type, cam_info in info['cams'].items():
+ image_paths.append(cam_info['data_path'])
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+ lidar2cam_t = cam_info[
+ 'sensor2lidar_translation'] @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
+ intrinsic = cam_info['cam_intrinsic']
+ viewpad = np.eye(4)
+ viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ lidar2img_rts.append(lidar2img_rt)
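+                # lidar2cam_rt is assembled in row-vector form (rotation
+                # transposed in the upper-left block, the lidar-to-camera
+                # translation in the bottom row), so lidar2cam_rt.T is the
+                # usual 4x4 column-vector extrinsic and viewpad @ lidar2cam_rt.T
+                # projects homogeneous LiDAR points straight into
+                # (intrinsics-padded) image coordinates.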
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ ))
+
+ if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+
+ return input_dict
+
+ def get_ann_info(self, index):
+ """Get annotation info according to the given index.
+
+ Args:
+ index (int): Index of the annotation data to get.
+
+ Returns:
+ dict: Annotation information consists of the following keys:
+
+ - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
+ 3D ground truth bboxes
+ - gt_labels_3d (np.ndarray): Labels of ground truths.
+ - gt_names (list[str]): Class names of ground truths.
+ """
+ info = self.data_infos[index]
+ # filter out bbox containing no points
+ if self.use_valid_flag:
+ mask = info['valid_flag']
+ else:
+ mask = info['num_lidar_pts'] > 0
+ gt_bboxes_3d = info['gt_boxes'][mask]
+ gt_names_3d = info['gt_names'][mask]
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ if self.with_velocity:
+ gt_velocity = info['gt_velocity'][mask]
+ nan_mask = np.isnan(gt_velocity[:, 0])
+ gt_velocity[nan_mask] = [0.0, 0.0]
+ gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
+
+ # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ gt_names=gt_names_3d)
+ return anns_results
+
+ def _format_bbox(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+
+ Returns:
+ str: Path of the output json file.
+ """
+
+ # import pdb
+ # pdb.set_trace()
+ nusc_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(track_iter_progress(results)):
+ annos = []
+ boxes = output_to_nusc_box(det)
+ sample_token = self.data_infos[sample_id]['token']
+ boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes,
+ mapped_class_names,
+ self.eval_detection_configs,
+ self.eval_version)
+ for i, box in enumerate(boxes):
+ name = mapped_class_names[box.label]
+ if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2:
+ if name in [
+ 'car',
+ 'construction_vehicle',
+ 'bus',
+ 'truck',
+ 'trailer',
+ ]:
+ attr = 'vehicle.moving'
+ elif name in ['bicycle', 'motorcycle']:
+ attr = 'cycle.with_rider'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+ else:
+ if name in ['pedestrian']:
+ attr = 'pedestrian.standing'
+ elif name in ['bus']:
+ attr = 'vehicle.stopped'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box.center.tolist(),
+ size=box.wlh.tolist(),
+ rotation=box.orientation.elements.tolist(),
+ velocity=box.velocity[:2].tolist(),
+ detection_name=name,
+ detection_score=box.score,
+ attribute_name=attr)
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ }
+
+ #pdb.set_trace()
+
+ mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+ print('Results writes to', res_path)
+ dump(nusc_submissions, res_path)
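+        # The file written above follows the submission layout expected by the
+        # official nuScenes detection evaluation, roughly:
+        #   {"meta":    {"use_camera": ..., "use_lidar": ..., ...},
+        #    "results": {sample_token: [{"sample_token": ..., "translation": ...,
+        #                                "size": ..., "rotation": ..., "velocity": ...,
+        #                                "detection_name": ..., "detection_score": ...,
+        #                                "attribute_name": ...}, ...]}}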
+ return res_path
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+ from nuscenes import NuScenes
+ from nuscenes.eval.detection.evaluate import NuScenesEval
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+ nusc = NuScenes(
+ version=self.version, dataroot=self.data_root, verbose=False)
+ eval_set_map = {
+ 'v1.0-mini': 'mini_val',
+ 'v1.0-trainval': 'val',
+ }
+ nusc_eval = NuScenesEval(
+ nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=False)
+ nusc_eval.main(render_curves=False)
+
+ # record metrics
+ metrics = load(osp.join(output_dir, 'metrics_summary.json'))
+ detail = dict()
+ metric_prefix = f'{result_name}_NuScenes'
+ for name in self.CLASSES:
+ for k, v in metrics['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,
+ self.ErrNameMapping[k])] = val
+
+ detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+ return detail
+
+ def format_results(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+                dict containing the json filepaths, `tmp_dir` is the temporary \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ assert len(results) == len(self), (
+ 'The length of results is not equal to the dataset len: {} != {}'.
+ format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ # currently the output prediction results could be in two formats
+ # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
+ # 2. list of dict('pts_bbox' or 'img_bbox':
+ # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
+ # this is a workaround to enable evaluation of both formats on nuScenes
+        # refer to https://github.com/open-mmlab/mmdetection3d/issues/449
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+                print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+ return result_files, tmp_dir
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+ if isinstance(result_files, dict):
+ results_dict = dict()
+ for name in result_names:
+ print('Evaluating bboxes of {}'.format(name))
+ ret_dict = self._evaluate_single(result_files[name])
+ results_dict.update(ret_dict)
+ elif isinstance(result_files, str):
+ results_dict = self._evaluate_single(result_files)
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+ return results_dict
+
+ def _build_default_pipeline(self):
+ """Build the default pipeline for this dataset."""
+ pipeline = [
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=5,
+ file_client_args=dict(backend='disk')),
+ dict(
+ type='LoadPointsFromMultiSweeps',
+ sweeps_num=10,
+ file_client_args=dict(backend='disk')),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=self.CLASSES,
+ with_label=False),
+ dict(type='Collect3D', keys=['points'])
+ ]
+ return Compose(pipeline)
+
+ def show(self, results, out_dir, show=True, pipeline=None):
+ """Results visualization.
+
+ Args:
+ results (list[dict]): List of bounding boxes results.
+ out_dir (str): Output directory of visualization result.
+ show (bool): Visualize the results online.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+ """
+ assert out_dir is not None, 'Expect out_dir, got none.'
+ pipeline = self._get_pipeline(pipeline)
+ for i, result in enumerate(results):
+ if 'pts_bbox' in result.keys():
+ result = result['pts_bbox']
+ data_info = self.data_infos[i]
+ pts_path = data_info['lidar_path']
+ file_name = osp.split(pts_path)[-1].split('.')[0]
+ points = self._extract_data(i, pipeline, 'points').numpy()
+ # for now we convert points into depth mode
+ points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
+ Coord3DMode.DEPTH)
+ inds = result['scores_3d'] > 0.1
+ gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy()
+ show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
+ Box3DMode.DEPTH)
+ pred_bboxes = result['boxes_3d'][inds].tensor.numpy()
+ show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
+ Box3DMode.DEPTH)
+ show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir,
+ file_name, show)
+
+
+def output_to_nusc_box(detection):
+ """Convert the output to the box class in the nuScenes.
+
+ Args:
+ detection (dict): Detection results.
+
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+ - labels_3d (torch.Tensor): Predicted box labels.
+
+ Returns:
+ list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+ """
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+ # TODO: check whether this is necessary
+ # with dir_offset & dir_limit in the head
+ box_yaw = -box_yaw - np.pi / 2
+
+ box_list = []
+ for i in range(len(box3d)):
+ quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ velocity = (*box3d.tensor[i, 7:9], 0.0)
+ # velo_val = np.linalg.norm(box3d[i, 7:9])
+ # velo_ori = box3d[i, 6]
+ # velocity = (
+ # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
+ box = NuScenesBox(
+ box_gravity_center[i],
+ box_dims[i],
+ quat,
+ label=labels[i],
+ score=scores[i],
+ velocity=velocity)
+ box_list.append(box)
+ return box_list
+
+
+def lidar_nusc_box_to_global(info,
+ boxes,
+ classes,
+ eval_configs,
+ eval_version='detection_cvpr_2019'):
+ """Convert the box from ego to global coordinate.
+
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ classes (list[str]): Mapped classes in the evaluation.
+ eval_configs (object): Evaluation configuration object.
+ eval_version (str): Evaluation version.
+ Default: 'detection_cvpr_2019'
+
+ Returns:
+ list: List of standard NuScenesBoxes in the global
+ coordinate.
+ """
+ box_list = []
+ for box in boxes:
+ # Move box to ego vehicle coord system
+ box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation']))
+ box.translate(np.array(info['lidar2ego_translation']))
+ # filter det in ego.
+ cls_range_map = eval_configs.class_range
+ radius = np.linalg.norm(box.center[:2], 2)
+ det_range = cls_range_map[classes[box.label]]
+ if radius > det_range:
+ continue
+ # Move box to global coord system
+ box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
+ box.translate(np.array(info['ego2global_translation']))
+ box_list.append(box)
+ return box_list
diff --git a/mmcv/datasets/nuscenes_e2e_dataset.py b/mmcv/datasets/nuscenes_e2e_dataset.py
new file mode 100644
index 0000000..38b3ffc
--- /dev/null
+++ b/mmcv/datasets/nuscenes_e2e_dataset.py
@@ -0,0 +1,1247 @@
+#---------------------------------------------------------------------------------#
+# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) #
+# Source code: https://github.com/OpenDriveLab/UniAD #
+# Copyright (c) OpenDriveLab. All rights reserved. #
+#---------------------------------------------------------------------------------#
+
+import copy
+import numpy as np
+import torch
+from mmcv.datasets import DATASETS
+from mmcv.datasets.pipelines import to_tensor
+from mmcv.datasets import NuScenesDataset
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from mmcv.fileio.file_client import FileClient
+from mmcv.fileio.io import load, dump
+from mmcv.utils import track_iter_progress, mkdir_or_exist
+from os import path as osp
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .eval_utils.nuscenes_eval import NuScenesEval_custom
+from nuscenes.eval.tracking.evaluate import TrackingEval
+from .eval_utils.nuscenes_eval_motion import MotionEval
+from nuscenes.eval.common.config import config_factory
+import tempfile
+from mmcv.parallel import DataContainer as DC
+import random
+import pickle
+from prettytable import PrettyTable
+
+from nuscenes import NuScenes
+from mmcv.datasets.data_utils.vector_map import VectorizedLocalMap
+from mmcv.datasets.data_utils.rasterize import preprocess_map
+from mmcv.datasets.eval_utils.map_api import NuScenesMap
+from mmcv.datasets.data_utils.trajectory_api import NuScenesTraj
+from .data_utils.data_utils import lidar_nusc_box_to_global, obtain_map_info, output_to_nusc_box, output_to_nusc_box_det
+from nuscenes.prediction import convert_local_coords_to_global
+
+
+@DATASETS.register_module()
+class NuScenesE2EDataset(NuScenesDataset):
+ r"""NuScenes E2E Dataset.
+
+    This dataset only adds camera intrinsics and extrinsics to the results.
+ """
+
+ def __init__(self,
+ queue_length=4,
+ bev_size=(200, 200),
+ patch_size=(102.4, 102.4),
+ canvas_size=(200, 200),
+ overlap_test=False,
+ predict_steps=12,
+ planning_steps=6,
+ past_steps=4,
+ fut_steps=4,
+ use_nonlinear_optimizer=False,
+ lane_ann_file=None,
+ eval_mod=None,
+
+ # For debug
+ is_debug=False,
+ len_debug=30,
+
+ # Occ dataset
+ enbale_temporal_aug=False,
+ occ_receptive_field=3,
+ occ_n_future=4,
+ occ_filter_invalid_sample=False,
+ occ_filter_by_valid_flag=False,
+
+ file_client_args=dict(backend='disk'),
+ *args,
+ **kwargs):
+ # init before super init since it is called in parent class
+ self.file_client_args = file_client_args
+ self.file_client = FileClient(**file_client_args)
+
+ self.is_debug = is_debug
+ self.len_debug = len_debug
+ super().__init__(*args, **kwargs)
+ self.queue_length = queue_length
+ self.overlap_test = overlap_test
+ self.bev_size = bev_size
+ self.predict_steps = predict_steps
+ self.planning_steps = planning_steps
+ self.past_steps = past_steps
+ self.fut_steps = fut_steps
+ self.scene_token = None
+ self.lane_infos = self.load_annotations(lane_ann_file) \
+ if lane_ann_file else None
+ self.eval_mod = eval_mod
+
+ self.use_nonlinear_optimizer = use_nonlinear_optimizer
+
+ self.nusc = NuScenes(version=self.version,
+ dataroot=self.data_root, verbose=True)
+
+ self.map_num_classes = 3
+ if canvas_size[0] == 50:
+ self.thickness = 1
+ elif canvas_size[0] == 200:
+ self.thickness = 2
+ else:
+ assert False
+ self.angle_class = 36
+ self.patch_size = patch_size
+ self.canvas_size = canvas_size
+ self.nusc_maps = {
+ 'boston-seaport': NuScenesMap(dataroot=self.data_root, map_name='boston-seaport'),
+ 'singapore-hollandvillage': NuScenesMap(dataroot=self.data_root, map_name='singapore-hollandvillage'),
+ 'singapore-onenorth': NuScenesMap(dataroot=self.data_root, map_name='singapore-onenorth'),
+ 'singapore-queenstown': NuScenesMap(dataroot=self.data_root, map_name='singapore-queenstown'),
+ }
+ self.vector_map = VectorizedLocalMap(
+ self.data_root,
+ patch_size=self.patch_size,
+ canvas_size=self.canvas_size)
+ self.traj_api = NuScenesTraj(self.nusc,
+ self.predict_steps,
+ self.planning_steps,
+ self.past_steps,
+ self.fut_steps,
+ self.with_velocity,
+ self.CLASSES,
+ self.box_mode_3d,
+ self.use_nonlinear_optimizer)
+
+ # Occ
+ self.enbale_temporal_aug = enbale_temporal_aug
+ assert self.enbale_temporal_aug is False
+
+ self.occ_receptive_field = occ_receptive_field # past + current
+ self.occ_n_future = occ_n_future # future only
+ self.occ_filter_invalid_sample = occ_filter_invalid_sample
+ self.occ_filter_by_valid_flag = occ_filter_by_valid_flag
+ self.occ_only_total_frames = 7 # NOTE: hardcode, not influenced by planning
+
+ def __len__(self):
+ if not self.is_debug:
+ return len(self.data_infos)
+ else:
+ return self.len_debug
+
+ def load_annotations(self, ann_file):
+ """Load annotations from ann_file.
+ Args:
+ ann_file (str): Path of the annotation file.
+
+ Returns:
+ list[dict]: List of annotations sorted by timestamps.
+ """
+        if self.file_client_args['backend'] in ('disk', 'petrel'):
+            data = pickle.loads(self.file_client.get(ann_file))
+            data_infos = list(
+                sorted(data['infos'], key=lambda e: e['timestamp']))
+            data_infos = data_infos[::self.load_interval]
+            self.metadata = data['metadata']
+            self.version = self.metadata['version']
+        else:
+            assert False, 'Invalid file_client_args!'
+        return data_infos
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ img: queue_length, 6, 3, H, W
+ img_metas: img_metas of each frame (list)
+                gt_labels_3d: gt_labels of each frame (list)
+ gt_bboxes_3d: gt_bboxes of each frame (list)
+ gt_inds: gt_inds of each frame (list)
+ """
+ data_queue = []
+ self.enbale_temporal_aug = False
+ if self.enbale_temporal_aug:
+ # temporal aug
+ prev_indexs_list = list(range(index-self.queue_length, index))
+ random.shuffle(prev_indexs_list)
+ prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True)
+ input_dict = self.get_data_info(index)
+ else:
+ # ensure the first and final frame in same scene
+ final_index = index
+ first_index = index - self.queue_length + 1
+ if first_index < 0:
+ return None
+ if self.data_infos[first_index]['scene_token'] != \
+ self.data_infos[final_index]['scene_token']:
+ return None
+ # current timestamp
+ input_dict = self.get_data_info(final_index)
+ prev_indexs_list = list(reversed(range(first_index, final_index)))
+ if input_dict is None:
+ return None
+ frame_idx = input_dict['frame_idx']
+ scene_token = input_dict['scene_token']
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+
+ assert example['gt_labels_3d'].data.shape[0] == example['gt_fut_traj'].shape[0]
+ assert example['gt_labels_3d'].data.shape[0] == example['gt_past_traj'].shape[0]
+
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ data_queue.insert(0, example)
+
+ # retrieve previous infos
+
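+        # prev_indexs_list is in descending order (the frame closest to the current
+        # one comes first), so inserting each processed example at position 0 keeps
+        # data_queue in chronological order with the current frame last.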
+ for i in prev_indexs_list:
+ if self.enbale_temporal_aug:
+ i = max(0, i)
+ input_dict = self.get_data_info(i)
+ if input_dict is None:
+ return None
+ if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token:
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+ return None
+ frame_idx = input_dict['frame_idx']
+ assert example['gt_labels_3d'].data.shape[0] == example['gt_fut_traj'].shape[0]
+ assert example['gt_labels_3d'].data.shape[0] == example['gt_past_traj'].shape[0]
+ data_queue.insert(0, copy.deepcopy(example))
+ data_queue = self.union2one(data_queue)
+ return data_queue
+
+ def prepare_test_data(self, index):
+ """
+        Testing data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+            dict: Testing data dict of the corresponding index.
+ img: queue_length, 6, 3, H, W
+ img_metas: img_metas of each frame (list)
+ gt_labels_3d: gt_labels of each frame (list)
+ gt_bboxes_3d: gt_bboxes of each frame (list)
+ gt_inds: gt_inds of each frame(list)
+ """
+
+ input_dict = self.get_data_info(index)
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ data_dict = {}
+ for key, value in example.items():
+ if 'l2g' in key:
+ data_dict[key] = to_tensor(value[0])
+ else:
+ data_dict[key] = value
+ return data_dict
+
+ def union2one(self, queue):
+ """
+        Convert the queue of per-frame sample dicts into one single training sample.
+ """
+ imgs_list = [each['img'].data for each in queue]
+ gt_labels_3d_list = [each['gt_labels_3d'].data for each in queue]
+ gt_sdc_label_list = [each['gt_sdc_label'].data for each in queue]
+ gt_inds_list = [to_tensor(each['gt_inds']) for each in queue]
+ gt_bboxes_3d_list = [each['gt_bboxes_3d'].data for each in queue]
+ gt_past_traj_list = [to_tensor(each['gt_past_traj']) for each in queue]
+ gt_past_traj_mask_list = [
+ to_tensor(each['gt_past_traj_mask']) for each in queue]
+ gt_sdc_bbox_list = [each['gt_sdc_bbox'].data for each in queue]
+ l2g_r_mat_list = [to_tensor(each['l2g_r_mat']) for each in queue]
+ l2g_t_list = [to_tensor(each['l2g_t']) for each in queue]
+ timestamp_list = [to_tensor(each['timestamp']) for each in queue]
+ gt_fut_traj = to_tensor(queue[-1]['gt_fut_traj'])
+ gt_fut_traj_mask = to_tensor(queue[-1]['gt_fut_traj_mask'])
+ # gt_sdc_fut_traj = to_tensor(queue[-1]['gt_sdc_fut_traj'])
+ # gt_sdc_fut_traj_mask = to_tensor(queue[-1]['gt_sdc_fut_traj_mask'])
+ # gt_future_boxes_list = queue[-1]['gt_future_boxes']
+ # gt_future_labels_list = [to_tensor(each)
+ # for each in queue[-1]['gt_future_labels']]
+
+ metas_map = {}
+ prev_pos = None
+ prev_angle = None
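+        # Rewrite each frame's can_bus so that position ([:3]) and yaw ([-1]) become
+        # deltas w.r.t. the previous frame in the queue; the first frame is zeroed
+        # and marked with prev_bev=False.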
+ for i, each in enumerate(queue):
+ metas_map[i] = each['img_metas'].data
+ if i == 0:
+ metas_map[i]['prev_bev'] = False
+ prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] = 0
+ metas_map[i]['can_bus'][-1] = 0
+ else:
+ metas_map[i]['prev_bev'] = True
+ tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] -= prev_pos
+ metas_map[i]['can_bus'][-1] -= prev_angle
+ prev_pos = copy.deepcopy(tmp_pos)
+ prev_angle = copy.deepcopy(tmp_angle)
+
+ queue[-1]['img'] = DC(torch.stack(imgs_list),
+ cpu_only=False, stack=True)
+ queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[-1]
+
+ queue['gt_labels_3d'] = DC(gt_labels_3d_list)
+ queue['gt_sdc_label'] = DC(gt_sdc_label_list)
+ queue['gt_inds'] = DC(gt_inds_list)
+ queue['gt_bboxes_3d'] = DC(gt_bboxes_3d_list, cpu_only=True)
+ queue['gt_sdc_bbox'] = DC(gt_sdc_bbox_list, cpu_only=True)
+ queue['l2g_r_mat'] = DC(l2g_r_mat_list)
+ queue['l2g_t'] = DC(l2g_t_list)
+ queue['timestamp'] = DC(timestamp_list)
+ queue['gt_fut_traj'] = DC(gt_fut_traj)
+ queue['gt_fut_traj_mask'] = DC(gt_fut_traj_mask)
+ queue['gt_past_traj'] = DC(gt_past_traj_list)
+ queue['gt_past_traj_mask'] = DC(gt_past_traj_mask_list)
+ # queue['gt_future_boxes'] = DC(gt_future_boxes_list, cpu_only=True)
+ # queue['gt_future_labels'] = DC(gt_future_labels_list)
+ return queue
+
+ def get_ann_info(self, index):
+ """Get annotation info according to the given index.
+
+ Args:
+ index (int): Index of the annotation data to get.
+
+ Returns:
+ dict: Annotation information consists of the following keys:
+
+ - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
+ 3D ground truth bboxes
+ - gt_labels_3d (np.ndarray): Labels of ground truths.
+ - gt_names (list[str]): Class names of ground truths.
+ - gt_inds (np.ndarray): Instance ids of ground truths.
+                - gt_fut_traj (np.ndarray): Future trajectories of ground truths.
+                - gt_fut_traj_mask (np.ndarray): Valid masks of the future trajectories.
+ """
+ info = self.data_infos[index]
+ # filter out bbox containing no points
+ if self.use_valid_flag:
+ mask = info['valid_flag']
+ else:
+ mask = info['num_lidar_pts'] > 0
+ gt_bboxes_3d = info['gt_boxes'][mask]
+ gt_names_3d = info['gt_names'][mask]
+ gt_inds = info['gt_inds'][mask]
+
+ sample = self.nusc.get('sample', info['token'])
+ ann_tokens = np.array(sample['anns'])[mask]
+ assert ann_tokens.shape[0] == gt_bboxes_3d.shape[0]
+
+ gt_fut_traj, gt_fut_traj_mask, gt_past_traj, gt_past_traj_mask = self.traj_api.get_traj_label(
+ info['token'], ann_tokens)
+
+ sdc_vel = self.traj_api.sdc_vel_info[info['token']]
+ gt_sdc_bbox, gt_sdc_label = self.traj_api.generate_sdc_info(sdc_vel)
+ gt_sdc_fut_traj, gt_sdc_fut_traj_mask = self.traj_api.get_sdc_traj_label(
+ info['token'])
+
+ sdc_planning, sdc_planning_mask, command = self.traj_api.get_sdc_planning_label(
+ info['token'])
+
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ if self.with_velocity:
+ gt_velocity = info['gt_velocity'][mask]
+ nan_mask = np.isnan(gt_velocity[:, 0])
+ gt_velocity[nan_mask] = [0.0, 0.0]
+ gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
+
+ # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ gt_names=gt_names_3d,
+ gt_inds=gt_inds,
+ gt_fut_traj=gt_fut_traj,
+ gt_fut_traj_mask=gt_fut_traj_mask,
+ gt_past_traj=gt_past_traj,
+ gt_past_traj_mask=gt_past_traj_mask,
+ gt_sdc_bbox=gt_sdc_bbox,
+ gt_sdc_label=gt_sdc_label,
+ gt_sdc_fut_traj=gt_sdc_fut_traj,
+ gt_sdc_fut_traj_mask=gt_sdc_fut_traj_mask,
+ sdc_planning=sdc_planning,
+ sdc_planning_mask=sdc_planning_mask,
+ command=command,
+ )
+ assert gt_fut_traj.shape[0] == gt_labels_3d.shape[0]
+ assert gt_past_traj.shape[0] == gt_labels_3d.shape[0]
+ return anns_results
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+
+ # semantic format
+ lane_info = self.lane_infos[index] if self.lane_infos else None
+ # panoptic format
+ location = self.nusc.get('log', self.nusc.get(
+ 'scene', info['scene_token'])['log_token'])['location']
+ vectors = self.vector_map.gen_vectorized_samples(location,
+ info['ego2global_translation'],
+ info['ego2global_rotation'])
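+        # Rasterize the vectorized local map into per-class instance masks, then
+        # derive a binary mask and an axis-aligned 2D box for every map instance
+        # (plus extra divider layers from the raster map below); these become the
+        # gt_lane_* targets passed to the pipeline.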
+ semantic_masks, instance_masks, forward_masks, backward_masks = preprocess_map(vectors,
+ self.patch_size,
+ self.canvas_size,
+ self.map_num_classes,
+ self.thickness,
+ self.angle_class)
+ instance_masks = np.rot90(instance_masks, k=-1, axes=(1, 2))
+ instance_masks = torch.tensor(instance_masks.copy())
+ gt_labels = []
+ gt_bboxes = []
+ gt_masks = []
+ for cls in range(self.map_num_classes):
+ for i in np.unique(instance_masks[cls]):
+ if i == 0:
+ continue
+ gt_mask = (instance_masks[cls] == i).to(torch.uint8)
+ ys, xs = np.where(gt_mask)
+ gt_bbox = [min(xs), min(ys), max(xs), max(ys)]
+ gt_labels.append(cls)
+ gt_bboxes.append(gt_bbox)
+ gt_masks.append(gt_mask)
+ map_mask = obtain_map_info(self.nusc,
+ self.nusc_maps,
+ info,
+ patch_size=self.patch_size,
+ canvas_size=self.canvas_size,
+ layer_names=['lane_divider', 'road_divider'])
+ map_mask = np.flip(map_mask, axis=1)
+ map_mask = np.rot90(map_mask, k=-1, axes=(1, 2))
+ map_mask = torch.tensor(map_mask.copy())
+ for i, gt_mask in enumerate(map_mask[:-1]):
+ ys, xs = np.where(gt_mask)
+ gt_bbox = [min(xs), min(ys), max(xs), max(ys)]
+ gt_labels.append(i + self.map_num_classes)
+ gt_bboxes.append(gt_bbox)
+ gt_masks.append(gt_mask)
+ gt_labels = torch.tensor(gt_labels)
+ gt_bboxes = torch.tensor(np.stack(gt_bboxes))
+ gt_masks = torch.stack(gt_masks)
+
+        # standard protocol modified from SECOND.Pytorch
+ input_dict = dict(
+ sample_idx=info['token'],
+ pts_filename=info['lidar_path'],
+ sweeps=info['sweeps'],
+ ego2global_translation=info['ego2global_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ prev_idx=info['prev'],
+ next_idx=info['next'],
+ scene_token=info['scene_token'],
+ can_bus=info['can_bus'],
+ frame_idx=info['frame_idx'],
+ timestamp=info['timestamp'] / 1e6,
+ map_filename=lane_info['maps']['map_mask'] if lane_info else None,
+ gt_lane_labels=gt_labels,
+ gt_lane_bboxes=gt_bboxes,
+ gt_lane_masks=gt_masks,
+ )
+
+ l2e_r = info['lidar2ego_rotation']
+ l2e_t = info['lidar2ego_translation']
+ e2g_r = info['ego2global_rotation']
+ e2g_t = info['ego2global_translation']
+ l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+ e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
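+        # Compose lidar->ego->global; the transposed products below are consistent
+        # with a row-vector convention, i.e. x_global = x_lidar @ l2g_r_mat + l2g_t.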
+ l2g_r_mat = l2e_r_mat.T @ e2g_r_mat.T
+ l2g_t = l2e_t @ e2g_r_mat.T + e2g_t
+
+ input_dict.update(
+ dict(
+ l2g_r_mat=l2g_r_mat.astype(np.float32),
+ l2g_t=l2g_t.astype(np.float32)))
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ for cam_type, cam_info in info['cams'].items():
+ image_paths.append(cam_info['data_path'])
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+ lidar2cam_t = cam_info[
+ 'sensor2lidar_translation'] @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
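+                # lidar2cam_rt is laid out for row vectors (rotation in the top-left
+                # 3x3, translation in the bottom row); it is transposed below so that
+                # viewpad @ lidar2cam_rt.T maps homogeneous column vectors from the
+                # lidar frame to image coordinates.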
+ intrinsic = cam_info['cam_intrinsic']
+ viewpad = np.eye(4)
+ viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ lidar2img_rts.append(lidar2img_rt)
+
+ cam_intrinsics.append(viewpad)
+ lidar2cam_rts.append(lidar2cam_rt.T)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam_intrinsic=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ ))
+
+ # if not self.test_mode:
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+ if 'sdc_planning' in input_dict['ann_info'].keys():
+ input_dict['sdc_planning'] = input_dict['ann_info']['sdc_planning']
+ input_dict['sdc_planning_mask'] = input_dict['ann_info']['sdc_planning_mask']
+ input_dict['command'] = input_dict['ann_info']['command']
+
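+        # can_bus layout written here: [:3] ego translation, [3:7] ego rotation
+        # quaternion, [-2] ego yaw in radians and [-1] the same yaw in degrees,
+        # normalized to [0, 360).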
+ rotation = Quaternion(input_dict['ego2global_rotation'])
+ translation = input_dict['ego2global_translation']
+ can_bus = input_dict['can_bus']
+ can_bus[:3] = translation
+ can_bus[3:7] = rotation
+ patch_angle = quaternion_yaw(rotation) / np.pi * 180
+ if patch_angle < 0:
+ patch_angle += 360
+ can_bus[-2] = patch_angle / 180 * np.pi
+ can_bus[-1] = patch_angle
+
+ # TODO: Warp all those below occupancy-related codes into a function
+ prev_indices, future_indices = self.occ_get_temporal_indices(
+ index, self.occ_receptive_field, self.occ_n_future)
+
+ # ego motions of all frames are needed
+ all_frames = prev_indices + [index] + future_indices
+
+        # whether an invalid frame is present
+        has_invalid_frame = -1 in all_frames[:self.occ_only_total_frames]
+        # NOTE: This can only represent 7 frames in total as it influences evaluation
+ input_dict['occ_has_invalid_frame'] = has_invalid_frame
+ input_dict['occ_img_is_valid'] = np.array(all_frames) >= 0
+
+ # might have None if not in the same sequence
+ future_frames = [index] + future_indices
+
+ # get lidar to ego to global transforms for each curr and fut index
+ occ_transforms = self.occ_get_transforms(
+ future_frames) # might have None
+ input_dict.update(occ_transforms)
+
+ # for (current and) future frames, detection labels are needed
+ # generate detection labels for current + future frames
+ input_dict['occ_future_ann_infos'] = \
+ self.get_future_detection_infos(future_frames)
+ return input_dict
+
+ def get_future_detection_infos(self, future_frames):
+ detection_ann_infos = []
+ for future_frame in future_frames:
+ if future_frame >= 0:
+ detection_ann_infos.append(
+ self.occ_get_detection_ann_info(future_frame),
+ )
+ else:
+ detection_ann_infos.append(None)
+ return detection_ann_infos
+
+ def occ_get_temporal_indices(self, index, receptive_field, n_future):
+ current_scene_token = self.data_infos[index]['scene_token']
+
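+        # Frames that fall outside the dataset bounds or outside the current scene
+        # are marked with -1; e.g. receptive_field=3 and n_future=4 give past
+        # offsets [-2, -1] and future offsets [+1, ..., +4] around `index`.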
+ # generate the past
+ previous_indices = []
+
+ for t in range(- receptive_field + 1, 0):
+ index_t = index + t
+ if index_t >= 0 and self.data_infos[index_t]['scene_token'] == current_scene_token:
+ previous_indices.append(index_t)
+ else:
+ previous_indices.append(-1) # for invalid indices
+
+ # generate the future
+ future_indices = []
+
+ for t in range(1, n_future + 1):
+ index_t = index + t
+ if index_t < len(self.data_infos) and self.data_infos[index_t]['scene_token'] == current_scene_token:
+ future_indices.append(index_t)
+ else:
+                # NOTE: how to deal with the invalid indices is still an open question
+ future_indices.append(-1)
+
+ return previous_indices, future_indices
+
+ def occ_get_transforms(self, indices, data_type=torch.float32):
+ """
+        Get the l2e and e2g rotations and translations for each valid frame.
+ """
+ l2e_r_mats = []
+ l2e_t_vecs = []
+ e2g_r_mats = []
+ e2g_t_vecs = []
+
+ for index in indices:
+ if index == -1:
+ l2e_r_mats.append(None)
+ l2e_t_vecs.append(None)
+ e2g_r_mats.append(None)
+ e2g_t_vecs.append(None)
+ else:
+ info = self.data_infos[index]
+ l2e_r = info['lidar2ego_rotation']
+ l2e_t = info['lidar2ego_translation']
+ e2g_r = info['ego2global_rotation']
+ e2g_t = info['ego2global_translation']
+
+ l2e_r_mat = torch.from_numpy(Quaternion(l2e_r).rotation_matrix)
+ e2g_r_mat = torch.from_numpy(Quaternion(e2g_r).rotation_matrix)
+
+ l2e_r_mats.append(l2e_r_mat.to(data_type))
+ l2e_t_vecs.append(torch.tensor(l2e_t).to(data_type))
+ e2g_r_mats.append(e2g_r_mat.to(data_type))
+ e2g_t_vecs.append(torch.tensor(e2g_t).to(data_type))
+
+ res = {
+ 'occ_l2e_r_mats': l2e_r_mats,
+ 'occ_l2e_t_vecs': l2e_t_vecs,
+ 'occ_e2g_r_mats': e2g_r_mats,
+ 'occ_e2g_t_vecs': e2g_t_vecs,
+ }
+
+ return res
+
+ def occ_get_detection_ann_info(self, index):
+ info = self.data_infos[index].copy()
+ gt_bboxes_3d = info['gt_boxes'].copy()
+ gt_names_3d = info['gt_names'].copy()
+ gt_ins_inds = info['gt_inds'].copy()
+
+ gt_vis_tokens = info.get('visibility_tokens', None)
+
+ if self.use_valid_flag:
+ gt_valid_flag = info['valid_flag']
+ else:
+ gt_valid_flag = info['num_lidar_pts'] > 0
+
+ assert self.occ_filter_by_valid_flag is False
+ if self.occ_filter_by_valid_flag:
+ gt_bboxes_3d = gt_bboxes_3d[gt_valid_flag]
+ gt_names_3d = gt_names_3d[gt_valid_flag]
+ gt_ins_inds = gt_ins_inds[gt_valid_flag]
+ gt_vis_tokens = gt_vis_tokens[gt_valid_flag]
+
+ # cls_name to cls_id
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ if self.with_velocity:
+ gt_velocity = info['gt_velocity']
+ nan_mask = np.isnan(gt_velocity[:, 0])
+ gt_velocity[nan_mask] = [0.0, 0.0]
+ gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
+
+ # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ # gt_names=gt_names_3d,
+ gt_inds=gt_ins_inds,
+ gt_vis_tokens=gt_vis_tokens,
+ )
+
+ return anns_results
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def _format_bbox(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+ Returns:
+ str: Path of the output json file.
+ """
+ nusc_annos = {}
+ nusc_map_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(track_iter_progress(results)):
+ annos = []
+ sample_token = self.data_infos[sample_id]['token']
+
+ if 'map' in self.eval_mod:
+ map_annos = {}
+ for key, value in det['ret_iou'].items():
+ map_annos[key] = float(value.numpy()[0])
+ nusc_map_annos[sample_token] = map_annos
+
+ if 'boxes_3d' not in det:
+ nusc_annos[sample_token] = annos
+ continue
+
+ boxes = output_to_nusc_box(det)
+ boxes_ego = copy.deepcopy(boxes)
+ boxes, keep_idx = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes,
+ mapped_class_names,
+ self.eval_detection_configs,
+ self.eval_version)
+ for i, box in enumerate(boxes):
+ name = mapped_class_names[box.label]
+ if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2:
+ if name in [
+ 'car',
+ 'construction_vehicle',
+ 'bus',
+ 'truck',
+ 'trailer',
+ ]:
+ attr = 'vehicle.moving'
+ elif name in ['bicycle', 'motorcycle']:
+ attr = 'cycle.with_rider'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+ else:
+ if name in ['pedestrian']:
+ attr = 'pedestrian.standing'
+ elif name in ['bus']:
+ attr = 'vehicle.stopped'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+
+ # center_ = box.center.tolist()
+ # change from ground height to center height
+ # center_[2] = center_[2] + (box.wlh.tolist()[2] / 2.0)
+ if name not in ['car', 'truck', 'bus', 'trailer', 'motorcycle',
+ 'bicycle', 'pedestrian', ]:
+ continue
+
+ box_ego = boxes_ego[keep_idx[i]]
+ trans = box_ego.center
+ if 'traj' in det:
+ traj_local = det['traj'][keep_idx[i]].numpy()[..., :2]
+ traj_scores = det['traj_scores'][keep_idx[i]].numpy()
+ else:
+ traj_local = np.zeros((0,))
+ traj_scores = np.zeros((0,))
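+                # Rotate the agent-centric trajectory by +90 degrees about z and
+                # translate it to the box center, i.e. into the frame of boxes_ego
+                # (boxes before the global transform); the fixed rotation appears to
+                # compensate for the heading convention of the motion head.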
+ traj_ego = np.zeros_like(traj_local)
+ rot = Quaternion(axis=np.array([0, 0.0, 1.0]), angle=np.pi/2)
+ for kk in range(traj_ego.shape[0]):
+ traj_ego[kk] = convert_local_coords_to_global(
+ traj_local[kk], trans, rot)
+
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box.center.tolist(),
+ size=box.wlh.tolist(),
+ rotation=box.orientation.elements.tolist(),
+ velocity=box.velocity[:2].tolist(),
+ detection_name=name,
+ detection_score=box.score,
+ attribute_name=attr,
+ tracking_name=name,
+ tracking_score=box.score,
+ tracking_id=box.token,
+ predict_traj=traj_ego,
+ predict_traj_score=traj_scores,
+ )
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ 'map_results': nusc_map_annos,
+ }
+
+ mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print('Results written to', res_path)
+ dump(nusc_submissions, res_path)
+ return res_path
+
+ def format_results(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+                dict containing the json filepaths, `tmp_dir` is the temporary \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ # assert len(results) == len(self), (
+ # 'The length of results is not equal to the dataset len: {} != {}'.
+ # format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ result_files = self._format_bbox(results, jsonfile_prefix)
+
+ return result_files, tmp_dir
+
+ def _format_bbox_det(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+ Returns:
+ str: Path of the output json file.
+ """
+ nusc_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(track_iter_progress(results)):
+ annos = []
+ sample_token = self.data_infos[sample_id]['token']
+
+ if det is None:
+ nusc_annos[sample_token] = annos
+ continue
+
+ boxes = output_to_nusc_box_det(det)
+ boxes_ego = copy.deepcopy(boxes)
+ boxes, keep_idx = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes,
+ mapped_class_names,
+ self.eval_detection_configs,
+ self.eval_version)
+ for i, box in enumerate(boxes):
+ name = mapped_class_names[box.label]
+ if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2:
+ if name in [
+ 'car',
+ 'construction_vehicle',
+ 'bus',
+ 'truck',
+ 'trailer',
+ ]:
+ attr = 'vehicle.moving'
+ elif name in ['bicycle', 'motorcycle']:
+ attr = 'cycle.with_rider'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+ else:
+ if name in ['pedestrian']:
+ attr = 'pedestrian.standing'
+ elif name in ['bus']:
+ attr = 'vehicle.stopped'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box.center.tolist(),
+ size=box.wlh.tolist(),
+ rotation=box.orientation.elements.tolist(),
+ velocity=box.velocity[:2].tolist(),
+ detection_name=name,
+ detection_score=box.score,
+ attribute_name=attr,
+ )
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ }
+
+ mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_nusc_det.json')
+        print('Results written to', res_path)
+ dump(nusc_submissions, res_path)
+ return res_path
+
+ def format_results_det(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+                dict containing the json filepaths, `tmp_dir` is the temporary \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ # assert len(results) == len(self), (
+ # 'The length of results is not equal to the dataset len: {} != {}'.
+ # format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results_det')
+ else:
+ tmp_dir = None
+
+ result_files = self._format_bbox_det(results, jsonfile_prefix)
+ return result_files, tmp_dir
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+ if isinstance(results, dict):
+ if 'occ_results_computed' in results.keys():
+ occ_results_computed = results['occ_results_computed']
+ out_metrics = ['iou']
+
+ # pan_eval
+ if occ_results_computed.get('pq', None) is not None:
+ out_metrics = ['iou', 'pq', 'sq', 'rq']
+
+ print("Occ-flow Val Results:")
+ for panoptic_key in out_metrics:
+ print(panoptic_key)
+ print(' & '.join(
+ [f'{x:.1f}' for x in occ_results_computed[panoptic_key]]))
+
+ if 'num_occ' in occ_results_computed.keys() and 'ratio_occ' in occ_results_computed.keys():
+ print(
+ f"num occ evaluated:{occ_results_computed['num_occ']}")
+ print(
+ f"ratio occ evaluated: {occ_results_computed['ratio_occ'] * 100:.1f}%")
+ if 'planning_results_computed' in results.keys():
+ planning_results_computed = results['planning_results_computed']
+ planning_tab = PrettyTable()
+ planning_tab.field_names = [
+ "metrics", "0.5s", "1.0s", "1.5s", "2.0s", "2.5s", "3.0s"]
+ for key in planning_results_computed.keys():
+ value = planning_results_computed[key]
+ row_value = []
+ row_value.append(key)
+ for i in range(len(value)):
+ row_value.append('%.4f' % float(value[i]))
+ planning_tab.add_row(row_value)
+ print(planning_tab)
+ results = results['bbox_results'] # get bbox_results
+
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+ result_files_det, tmp_dir = self.format_results_det(
+ results, jsonfile_prefix)
+
+ if isinstance(result_files, dict):
+ results_dict = dict()
+ for name in result_names:
+ print('Evaluating bboxes of {}'.format(name))
+ ret_dict = self._evaluate_single(
+ result_files[name], result_files_det[name])
+ results_dict.update(ret_dict)
+ elif isinstance(result_files, str):
+ results_dict = self._evaluate_single(
+ result_files, result_files_det)
+
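+        # Map IoUs are aggregated over the whole split by summing per-frame
+        # intersections and unions before dividing, rather than averaging
+        # per-frame IoU values.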
+ if 'map' in self.eval_mod:
+ drivable_intersection = 0
+ drivable_union = 0
+ lanes_intersection = 0
+ lanes_union = 0
+ divider_intersection = 0
+ divider_union = 0
+ crossing_intersection = 0
+ crossing_union = 0
+ contour_intersection = 0
+ contour_union = 0
+ for i in range(len(results)):
+ drivable_intersection += results[i]['ret_iou']['drivable_intersection']
+ drivable_union += results[i]['ret_iou']['drivable_union']
+ lanes_intersection += results[i]['ret_iou']['lanes_intersection']
+ lanes_union += results[i]['ret_iou']['lanes_union']
+ divider_intersection += results[i]['ret_iou']['divider_intersection']
+ divider_union += results[i]['ret_iou']['divider_union']
+ crossing_intersection += results[i]['ret_iou']['crossing_intersection']
+ crossing_union += results[i]['ret_iou']['crossing_union']
+ contour_intersection += results[i]['ret_iou']['contour_intersection']
+ contour_union += results[i]['ret_iou']['contour_union']
+ results_dict.update({'drivable_iou': float(drivable_intersection / drivable_union),
+ 'lanes_iou': float(lanes_intersection / lanes_union),
+ 'divider_iou': float(divider_intersection / divider_union),
+ 'crossing_iou': float(crossing_intersection / crossing_union),
+ 'contour_iou': float(contour_intersection / contour_union)})
+
+ print(results_dict)
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+ return results_dict
+
+ def _evaluate_single(self,
+ result_path,
+ result_path_det,
+ logger=None,
+ metric='bbox',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+
+ # TODO: fix the evaluation pipelines
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+ output_dir_det = osp.join(output_dir, 'det')
+ output_dir_track = osp.join(output_dir, 'track')
+ output_dir_motion = osp.join(output_dir, 'motion')
+ mkdir_or_exist(output_dir_det)
+ mkdir_or_exist(output_dir_track)
+ mkdir_or_exist(output_dir_motion)
+
+ eval_set_map = {
+ 'v1.0-mini': 'mini_val',
+ 'v1.0-trainval': 'val',
+ }
+ detail = dict()
+
+ if 'det' in self.eval_mod:
+ self.nusc_eval = NuScenesEval_custom(
+ self.nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path_det,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir_det,
+ verbose=True,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos
+ )
+ self.nusc_eval.main(plot_examples=0, render_curves=False)
+ # record metrics
+ metrics = load(
+ osp.join(
+ output_dir_det,
+ 'metrics_summary.json'))
+ metric_prefix = f'{result_name}_NuScenes'
+ for name in self.CLASSES:
+ for k, v in metrics['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(
+ metric_prefix, name, k)] = val
+ for k, v in metrics['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,
+ self.ErrNameMapping[k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+
+ if 'track' in self.eval_mod:
+ cfg = config_factory("tracking_nips_2019")
+ self.nusc_eval_track = TrackingEval(
+ config=cfg,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir_track,
+ verbose=True,
+ nusc_version=self.version,
+ nusc_dataroot=self.data_root
+ )
+ self.nusc_eval_track.main()
+ # record metrics
+ metrics = load(
+ osp.join(
+ output_dir_track,
+ 'metrics_summary.json'))
+ keys = ['amota', 'amotp', 'recall', 'motar',
+ 'gt', 'mota', 'motp', 'mt', 'ml', 'faf',
+ 'tp', 'fp', 'fn', 'ids', 'frag', 'tid', 'lgd']
+ for key in keys:
+ detail['{}/{}'.format(metric_prefix, key)] = metrics[key]
+
+ # if 'map' in self.eval_mod:
+ # for i, ret_iou in enumerate(ret_ious):
+ # detail['iou_{}'.format(i)] = ret_iou
+
+ if 'motion' in self.eval_mod:
+ self.nusc_eval_motion = MotionEval(
+ self.nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=True,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos,
+ category_convert_type='motion_category'
+ )
+ print('-'*50)
+ print(
+                'Evaluate on motion categories (vehicle and pedestrian classes merged)...')
+ print('evaluate standard motion metrics...')
+ self.nusc_eval_motion.main(
+ plot_examples=0,
+ render_curves=False,
+ eval_mode='standard')
+ print('evaluate motion mAP-minFDE metrics...')
+ self.nusc_eval_motion.main(
+ plot_examples=0,
+ render_curves=False,
+ eval_mode='motion_map')
+ print('evaluate EPA motion metrics...')
+ self.nusc_eval_motion.main(
+ plot_examples=0,
+ render_curves=False,
+ eval_mode='epa')
+ print('-'*50)
+ print('Evaluate on detection category...')
+ self.nusc_eval_motion = MotionEval(
+ self.nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=True,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos,
+ category_convert_type='detection_category'
+ )
+ print('evaluate standard motion metrics...')
+ self.nusc_eval_motion.main(
+ plot_examples=0,
+ render_curves=False,
+ eval_mode='standard')
+            print('evaluate motion mAP-minFDE metrics...')
+ self.nusc_eval_motion.main(
+ plot_examples=0,
+ render_curves=False,
+ eval_mode='motion_map')
+ print('evaluate EPA motion metrics...')
+ self.nusc_eval_motion.main(
+ plot_examples=0,
+ render_curves=False,
+ eval_mode='epa')
+
+ return detail
diff --git a/mmcv/datasets/nuscenes_eval.py b/mmcv/datasets/nuscenes_eval.py
new file mode 100644
index 0000000..a0dc0b7
--- /dev/null
+++ b/mmcv/datasets/nuscenes_eval.py
@@ -0,0 +1,752 @@
+import argparse
+import copy
+import json
+import os
+import time
+import cv2
+import random
+import tqdm
+import torch
+from typing import Tuple, Dict, Any
+from mmcv.fileio.io import dump,load
+from torchvision.transforms.functional import rotate
+import numpy as np
+from pyquaternion import Quaternion
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.detection.data_classes import DetectionConfig
+from nuscenes.eval.detection.evaluate import NuScenesEval
+from nuscenes.eval.detection.data_classes import DetectionBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.tracking.data_classes import TrackingBox
+from nuscenes.utils.data_classes import Box
+from nuscenes.utils.splits import create_splits_scenes
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+import pycocotools.mask as mask_util
+# from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from nuscenes.eval.common.loaders import load_gt, add_center_dist, filter_eval_boxes
+from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp
+from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \
+ DetectionMetricDataList
+from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample
+from matplotlib import pyplot as plt
+from nuscenes.eval.common.render import setup_axis
+from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \
+ PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS
+from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList
+import mmcv
+
+
+Axis = Any
+
+def class_tp_curve(md_list: DetectionMetricDataList,
+ metrics: DetectionMetrics,
+ detection_name: str,
+ min_recall: float,
+ dist_th_tp: float,
+ savepath: str = None,
+ ax: Axis = None) -> None:
+ """
+ Plot the true positive curve for the specified class.
+ :param md_list: DetectionMetricDataList instance.
+ :param metrics: DetectionMetrics instance.
+    :param detection_name: The detection class to plot.
+ :param min_recall: Minimum recall value.
+ :param dist_th_tp: The distance threshold used to determine matches.
+    :param savepath: If given, saves the rendering here instead of displaying.
+ :param ax: Axes onto which to render.
+ """
+ # Get metric data for given detection class with tp distance threshold.
+
+ md = md_list[(detection_name, dist_th_tp)]
+ min_recall_ind = round(100 * min_recall)
+ if min_recall_ind <= md.max_recall_ind:
+ # For traffic_cone and barrier only a subset of the metrics are plotted.
+ rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
+ ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
+ else:
+ ylimit = 1.0
+
+ # Prepare axis.
+ if ax is None:
+ ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
+ min_recall=min_recall)
+ ax.set_ylim(0, ylimit)
+
+ # Plot the recall vs. error curve for each tp metric.
+ for metric in TP_METRICS:
+ tp = metrics.get_label_tp(detection_name, metric)
+
+ # Plot only if we have valid data.
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
+ recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
+ else:
+ recall, error = [], []
+
+ # Change legend based on tp value
+ if tp is np.nan:
+ label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
+ elif min_recall_ind > md.max_recall_ind:
+ label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
+ else:
+ label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
+ if metric == 'trans_err':
+ label += f' ({md.max_recall_ind})' # add recall
+ print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
+ ax.plot(recall, error, label=label)
+ ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
+ ax.legend(loc='best')
+
+ if savepath is not None:
+ plt.savefig(savepath)
+ plt.close()
+
+
+class DetectionBox_modified(DetectionBox):
+ def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
+ '''
+ add annotation token
+ '''
+ super().__init__(*args, **kwargs)
+ self.token = token
+ self.visibility = visibility
+ self.index = index
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'token': self.token,
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'visibility': self.visibility,
+ 'index': self.index
+
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(
+ token=content['token'],
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'],
+ visibility=content['visibility'],
+ index=content['index'],
+ )
+
+
+def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+ Check if a box is visible inside an image without accounting for occlusions.
+ :param box: The box to be checked.
+    :param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of <BoxVisibility>.
+    :return: True if visibility condition is satisfied.
+ """
+
+ center_3d = box.center.reshape(3, 1)
+ center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, center_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, center_img[1, :] > 0)
+ visible = np.logical_and(visible, center_3d[2, :] > 1)
+
+    in_front = center_3d[2, :] > 0.1  # True if the box center is at least 0.1 meter in front of the camera.
+
+ if vis_level == BoxVisibility.ALL:
+ return all(visible) and all(in_front)
+ elif vis_level == BoxVisibility.ANY:
+ return any(visible) and all(in_front)
+ elif vis_level == BoxVisibility.NONE:
+ return True
+ else:
+ raise ValueError("vis_level: {} not valid".format(vis_level))
+
+
+def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
+ vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if a box is visible in the image, but not all of its corners are.
+    :param box: The box to be checked.
+    :param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of <BoxVisibility>.
+    :return: True if visibility condition is satisfied.
+ """
+
+ corners_3d = box.corners()
+ corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, corners_img[1, :] > 0)
+ visible = np.logical_and(visible, corners_3d[2, :] > 1)
+
+ in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
+
+ if any(visible) and not all(visible) and all(in_front):
+ return True
+ else:
+ return False
+
+def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \
+ -> Tuple[EvalBoxes, Dict]:
+ """
+ Loads object predictions from file.
+ :param result_path: Path to the .json result file provided by the user.
+    :param max_boxes_per_sample: Maximum number of boxes allowed per sample.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+ :param verbose: Whether to print messages to stdout.
+ :return: The deserialized results and meta data.
+ """
+
+ # Load from file and check that the format is correct.
+ # with open(result_path) as f:
+ # data = json.load(f)
+ data = load(result_path)
+    assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed. ' \
+ 'See https://www.nuscenes.org/object-detection for more information.'
+
+ # Deserialize results and get meta data.
+ all_results = EvalBoxes.deserialize(data['results'], box_cls)
+ meta = data['meta']
+ if verbose:
+ print("Loaded results from {}. Found detections for {} samples."
+ .format(result_path, len(all_results.sample_tokens)))
+
+ # Check that each sample has no more than x predicted boxes.
+ for sample_token in all_results.sample_tokens:
+ assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \
+ "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample
+
+ return all_results, meta
+
+def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False):
+ """
+ Loads ground truth boxes from DB.
+ :param nusc: A NuScenes instance.
+ :param eval_split: The evaluation split for which we load GT boxes.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+ :param verbose: Whether to print messages to stdout.
+ :return: The GT boxes.
+ """
+
+ # Init.
+ if box_cls == DetectionBox_modified:
+ attribute_map = {a['token']: a['name'] for a in nusc.attribute}
+
+ if verbose:
+ print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
+ # Read out all sample_tokens in DB.
+ sample_tokens_all = [s['token'] for s in nusc.sample]
+ assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
+
+ # Only keep samples from this split.
+ splits = create_splits_scenes()
+
+ # Check compatibility of split with nusc_version.
+ version = nusc.version
+ if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
+ assert version.endswith('trainval'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split in {'mini_train', 'mini_val'}:
+ assert version.endswith('mini'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split == 'test':
+ assert version.endswith('test'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ else:
+ raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
+ .format(eval_split))
+
+ if eval_split == 'test':
+ # Check that you aren't trying to cheat :).
+ assert len(nusc.sample_annotation) > 0, \
+ 'Error: You are trying to evaluate on the test set but you do not have the annotations!'
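+    # Build a 1-based frame index for every sample by walking each scene from its
+    # first sample; the index is attached to every GT box below.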
+ index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ index_map[sample['token']] = index
+ index += 1
+
+ sample_tokens = []
+ for sample_token in sample_tokens_all:
+ scene_token = nusc.get('sample', sample_token)['scene_token']
+ scene_record = nusc.get('scene', scene_token)
+ if scene_record['name'] in splits[eval_split]:
+ sample_tokens.append(sample_token)
+
+ all_annotations = EvalBoxes()
+
+ # Load annotations and filter predictions and annotations.
+ tracking_id_set = set()
+ for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
+
+ sample = nusc.get('sample', sample_token)
+ sample_annotation_tokens = sample['anns']
+
+ sample_boxes = []
+ for sample_annotation_token in sample_annotation_tokens:
+
+ sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
+ if box_cls == DetectionBox_modified:
+ # Get label name in detection task and filter unused labels.
+ detection_name = category_to_detection_name(sample_annotation['category_name'])
+ if detection_name is None:
+ continue
+
+ # Get attribute_name.
+ attr_tokens = sample_annotation['attribute_tokens']
+ attr_count = len(attr_tokens)
+ if attr_count == 0:
+ attribute_name = ''
+ elif attr_count == 1:
+ attribute_name = attribute_map[attr_tokens[0]]
+ else:
+ raise Exception('Error: GT annotations must not have more than one attribute!')
+
+ sample_boxes.append(
+ box_cls(
+ token=sample_annotation_token,
+ sample_token=sample_token,
+ translation=sample_annotation['translation'],
+ size=sample_annotation['size'],
+ rotation=sample_annotation['rotation'],
+ velocity=nusc.box_velocity(sample_annotation['token'])[:2],
+ num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
+ detection_name=detection_name,
+ detection_score=-1.0, # GT samples do not have a score.
+ attribute_name=attribute_name,
+ visibility=sample_annotation['visibility_token'],
+ index=index_map[sample_token]
+ )
+ )
+ elif box_cls == TrackingBox:
+ assert False
+ else:
+ raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
+
+ all_annotations.add_boxes(sample_token, sample_boxes)
+
+ if verbose:
+ print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
+
+ return all_annotations
+
+
+def filter_eval_boxes_by_id(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ id=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Keep only the boxes whose annotation token is in `id`.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param id: The set of annotation tokens used to keep boxes.
+    :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.token in id:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_eval_boxes_by_visibility(
+ ori_eval_boxes: EvalBoxes,
+ visibility=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Keep only the boxes whose visibility token equals `visibility`.
+    :param ori_eval_boxes: An instance of the EvalBoxes class.
+    :param visibility: The visibility token used to keep boxes.
+    :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.visibility == visibility:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After visibility based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False):
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ for sample_token in eval_boxes.sample_tokens:
+ if sample_token not in valid_sample_tokens:
+ eval_boxes.boxes.pop(sample_token)
+ return eval_boxes
+
+
+def filter_eval_boxes_by_overlap(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Applies filtering to boxes based on camera overlap.
+ :param nusc: An instance of the NuScenes class.
+ :param eval_boxes: An instance of the EvalBoxes class.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ sample_record = nusc.get('sample', sample_token)
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ count = 0
+ for cam in cams:
+                # copied from the nuScenes devkit
+ sample_data_token = sample_record['data'][cam]
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ new_box = Box(box.translation, box.size, Quaternion(box.rotation),
+ name=box.detection_name, token='')
+
+ # Move box to ego vehicle coord system.
+ new_box.translate(-np.array(pose_record['translation']))
+ new_box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ new_box.translate(-np.array(cs_record['translation']))
+ new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ count += 1
+ # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ # count += 1
+
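+            # Keep a box only if its center projects into at least two cameras,
+            # i.e. it lies in an overlap region between adjacent camera views.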
+ if count > 1:
+ with open('center_overlap.txt', 'a') as f:
+ try:
+ f.write(box.token + '\n')
+ except:
+ pass
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ verbose = True
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+class NuScenesEval_custom(NuScenesEval):
+ """
+    Customized nuScenes detection evaluation extending NuScenesEval with filtering by camera overlap, visibility and per-scene frame index.
+ """
+
+ def __init__(self,
+ nusc: NuScenes,
+ config: DetectionConfig,
+ result_path: str,
+ eval_set: str,
+ output_dir: str = None,
+ verbose: bool = True,
+ overlap_test=False,
+ eval_mask=False,
+ data_infos=None
+ ):
+ """
+ Initialize a DetectionEval object.
+ :param nusc: A NuScenes object.
+ :param config: A DetectionConfig object.
+ :param result_path: Path of the nuScenes JSON result file.
+ :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+ :param output_dir: Folder to save plots and results to.
+ :param verbose: Whether to print to stdout.
+ """
+
+ self.nusc = nusc
+ self.result_path = result_path
+ self.eval_set = eval_set
+ self.output_dir = output_dir
+ self.verbose = verbose
+ self.cfg = config
+ self.overlap_test = overlap_test
+ self.eval_mask = eval_mask
+ self.data_infos = data_infos
+ # Check result file exists.
+ assert os.path.exists(result_path), 'Error: The result file does not exist!'
+
+ # Make dirs.
+ self.plot_dir = os.path.join(self.output_dir, 'plots')
+ if not os.path.isdir(self.output_dir):
+ os.makedirs(self.output_dir)
+ if not os.path.isdir(self.plot_dir):
+ os.makedirs(self.plot_dir)
+
+ # Load data.
+ if verbose:
+ print('Initializing nuScenes detection evaluation')
+ self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
+ verbose=verbose)
+ self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
+
+ assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+            "Samples in split don't match samples in predictions."
+
+ # Add center distances.
+ self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+ self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+ # Filter boxes (distance, points per box, etc.).
+
+ if verbose:
+ print('Filtering predictions')
+ self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
+ if verbose:
+ print('Filtering ground truth annotations')
+ self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)
+
+ if self.overlap_test:
+ self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
+
+ self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)
+
+ self.all_gt = copy.deepcopy(self.gt_boxes)
+ self.all_preds = copy.deepcopy(self.pred_boxes)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+ self.index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ self.index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ self.index_map[sample['token']] = index
+ index += 1
+
+ def update_gt(self, type_='vis', visibility='1', index=1):
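+        """
+        Restrict the boxes used for evaluation: type_='vis' keeps only GT boxes
+        with the given visibility token, while type_='ord' keeps only the samples
+        (GT and predictions) at the given frame index within each scene.
+        """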
+ if type_ == 'vis':
+ self.visibility_test = True
+ if self.visibility_test:
+ '''[{'description': 'visibility of whole object is between 0 and 40%',
+ 'token': '1',
+ 'level': 'v0-40'},
+ {'description': 'visibility of whole object is between 40 and 60%',
+ 'token': '2',
+ 'level': 'v40-60'},
+ {'description': 'visibility of whole object is between 60 and 80%',
+ 'token': '3',
+ 'level': 'v60-80'},
+ {'description': 'visibility of whole object is between 80 and 100%',
+ 'token': '4',
+ 'level': 'v80-100'}]'''
+
+ self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True)
+
+ elif type_ == 'ord':
+
+ valid_tokens = [key for (key, value) in self.index_map.items() if value == index]
+ self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
+ self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+
+ def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMetricDataList()
+
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMetrics(self.cfg)
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None:
+ """
+ Renders various PR and TP curves.
+ :param metrics: DetectionMetrics instance.
+ :param md_list: DetectionMetricDataList instance.
+ """
+ if self.verbose:
+ print('Rendering PR and TP curves')
+
+ def savepath(name):
+ return os.path.join(self.plot_dir, name + '.pdf')
+
+ summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall,
+ dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary'))
+
+ for detection_name in self.cfg.class_names:
+ class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath(detection_name + '_pr'))
+
+ class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp,
+ savepath=savepath(detection_name + '_tp'))
+
+ for dist_th in self.cfg.dist_ths:
+ dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath('dist_pr_' + str(dist_th)))
+
+
+if __name__ == "__main__":
+
+ # Settings.
+ parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('result_path', type=str, help='The submission as a JSON file.')
+ parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics',
+ help='Folder to store result metrics, graphs and example visualizations.')
+ parser.add_argument('--eval_set', type=str, default='val',
+ help='Which dataset split to evaluate on, train, val or test.')
+ parser.add_argument('--dataroot', type=str, default='data/nuscenes',
+ help='Default nuScenes data directory.')
+ parser.add_argument('--version', type=str, default='v1.0-trainval',
+ help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
+ parser.add_argument('--config_path', type=str, default='',
+ help='Path to the configuration file. '
+ 'If no path is given, the CVPR 2019 configuration will be used.')
+ parser.add_argument('--plot_examples', type=int, default=0,
+ help='How many example visualizations to write to disk.')
+ parser.add_argument('--render_curves', type=int, default=1,
+ help='Whether to render PR and TP curves to disk.')
+ parser.add_argument('--verbose', type=int, default=1,
+ help='Whether to print to stdout.')
+ args = parser.parse_args()
+
+ result_path_ = os.path.expanduser(args.result_path)
+ output_dir_ = os.path.expanduser(args.output_dir)
+ eval_set_ = args.eval_set
+ dataroot_ = args.dataroot
+ version_ = args.version
+ config_path = args.config_path
+ plot_examples_ = args.plot_examples
+ render_curves_ = bool(args.render_curves)
+ verbose_ = bool(args.verbose)
+
+ if config_path == '':
+ cfg_ = config_factory('detection_cvpr_2019')
+ else:
+ with open(config_path, 'r') as _f:
+ cfg_ = DetectionConfig.deserialize(json.load(_f))
+
+ nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
+ nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
+ output_dir=output_dir_, verbose=verbose_)
+ for vis in ['1', '2', '3', '4']:
+ nusc_eval.update_gt(type_='vis', visibility=vis)
+ print(f'================ {vis} ===============')
+ nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_)
+ #for index in range(1, 41):
+ # nusc_eval.update_gt(type_='ord', index=index)
+ #
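+ # Illustrative invocation (hedged sketch): the script path below assumes this
+ # file lives at mmcv/datasets/nuscnes_eval.py, as it is imported elsewhere in
+ # this diff, and the result JSON must already be in nuScenes submission format.
+ #   python mmcv/datasets/nuscnes_eval.py work_dirs/results_nusc.json \
+ #       --output_dir work_dirs/nusc_eval --dataroot data/nuscenes \
+ #       --version v1.0-trainval --eval_set val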
diff --git a/mmcv/datasets/nuscenes_mono_dataset.py b/mmcv/datasets/nuscenes_mono_dataset.py
new file mode 100644
index 0000000..b036b87
--- /dev/null
+++ b/mmcv/datasets/nuscenes_mono_dataset.py
@@ -0,0 +1,777 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import mmcv
+import numpy as np
+import pyquaternion
+import tempfile
+import torch
+import warnings
+from nuscenes.utils.data_classes import Box as NuScenesBox
+from os import path as osp
+
+from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
+from mmdet.datasets import DATASETS, CocoDataset
+from mmdet3d.core import show_multi_modality_result
+from mmdet3d.core.bbox import CameraInstance3DBoxes, get_box_type
+from mmdet3d.datasets.pipelines import Compose
+from mmdet3d.datasets.utils import extract_result_dict, get_loading_pipeline
+
+
+@DATASETS.register_module()
+class CustomNuScenesMonoDataset(CocoDataset):
+ r"""Monocular 3D detection on NuScenes Dataset.
+ This class serves as the API for experiments on the NuScenes Dataset.
+ Please refer to the `NuScenes Dataset <https://www.nuscenes.org/download>`_
+ for data downloading.
+ Args:
+ ann_file (str): Path of annotation file.
+ data_root (str): Path of dataset root.
+ load_interval (int, optional): Interval of loading the dataset. It is
+ used to uniformly sample the dataset. Defaults to 1.
+ with_velocity (bool, optional): Whether include velocity prediction
+ into the experiments. Defaults to True.
+ modality (dict, optional): Modality to specify the sensor data used
+ as input. Defaults to None.
+ box_type_3d (str, optional): Type of 3D box of this dataset.
+ Based on `box_type_3d`, the dataset will encapsulate boxes in
+ their original format and then convert them to `box_type_3d`.
+ Defaults to 'Camera' in this class. Available options include:
+ - 'LiDAR': Box in LiDAR coordinates.
+ - 'Depth': Box in depth coordinates, usually for indoor dataset.
+ - 'Camera': Box in camera coordinates.
+ eval_version (str, optional): Configuration version of evaluation.
+ Defaults to 'detection_cvpr_2019'.
+ use_valid_flag (bool): Whether to use `use_valid_flag` key in the info
+ file as mask to filter gt_boxes and gt_names. Defaults to False.
+ version (str, optional): Dataset version. Defaults to 'v1.0-trainval'.
+ """
+ CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+ 'barrier')
+ DefaultAttribute = {
+ 'car': 'vehicle.parked',
+ 'pedestrian': 'pedestrian.moving',
+ 'trailer': 'vehicle.parked',
+ 'truck': 'vehicle.parked',
+ 'bus': 'vehicle.moving',
+ 'motorcycle': 'cycle.without_rider',
+ 'construction_vehicle': 'vehicle.parked',
+ 'bicycle': 'cycle.without_rider',
+ 'barrier': '',
+ 'traffic_cone': '',
+ }
+ # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa
+ ErrNameMapping = {
+ 'trans_err': 'mATE',
+ 'scale_err': 'mASE',
+ 'orient_err': 'mAOE',
+ 'vel_err': 'mAVE',
+ 'attr_err': 'mAAE'
+ }
+
+ def __init__(self,
+ data_root,
+ load_interval=1,
+ with_velocity=True,
+ modality=None,
+ box_type_3d='Camera',
+ eval_version='detection_cvpr_2019',
+ use_valid_flag=False,
+ overlap_test=False,
+ version='v1.0-trainval',
+ **kwargs):
+ super().__init__(**kwargs)
+ # overlap_test = True
+ self.data_root = data_root
+ self.overlap_test = overlap_test
+ self.load_interval = load_interval
+ self.with_velocity = with_velocity
+ self.modality = modality
+ self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
+ self.eval_version = eval_version
+ self.use_valid_flag = use_valid_flag
+ self.bbox_code_size = 9
+ self.version = version
+ if self.eval_version is not None:
+ from nuscenes.eval.detection.config import config_factory
+ self.eval_detection_configs = config_factory(self.eval_version)
+ if self.modality is None:
+ self.modality = dict(
+ use_camera=True,
+ use_lidar=False,
+ use_radar=False,
+ use_map=False,
+ use_external=False)
+
+ def pre_pipeline(self, results):
+ """Initialization before data preparation.
+ Args:
+ results (dict): Dict before data preprocessing.
+ - img_fields (list): Image fields.
+ - bbox3d_fields (list): 3D bounding boxes fields.
+ - pts_mask_fields (list): Mask fields of points.
+ - pts_seg_fields (list): Mask fields of point segments.
+ - bbox_fields (list): Fields of bounding boxes.
+ - mask_fields (list): Fields of masks.
+ - seg_fields (list): Segment fields.
+ - box_type_3d (str): 3D box type.
+ - box_mode_3d (str): 3D box mode.
+ """
+ results['img_prefix'] = '' # self.img_prefix
+ results['seg_prefix'] = self.seg_prefix
+ results['proposal_file'] = self.proposal_file
+ results['img_fields'] = []
+ results['bbox3d_fields'] = []
+ results['pts_mask_fields'] = []
+ results['pts_seg_fields'] = []
+ results['bbox_fields'] = []
+ results['mask_fields'] = []
+ results['seg_fields'] = []
+ results['box_type_3d'] = self.box_type_3d
+ results['box_mode_3d'] = self.box_mode_3d
+
+ def _parse_ann_info(self, img_info, ann_info):
+ """Parse bbox annotation.
+ Args:
+ img_info (list[dict]): Image info.
+ ann_info (list[dict]): Annotation info of an image.
+ Returns:
+ dict: A dict containing the following keys: bboxes, labels, \
+ gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \
+ depths, bboxes_ignore, masks, seg_map
+ """
+ gt_bboxes = []
+ gt_labels = []
+ attr_labels = []
+ gt_bboxes_ignore = []
+ gt_masks_ann = []
+ gt_bboxes_cam3d = []
+ centers2d = []
+ depths = []
+ for i, ann in enumerate(ann_info):
+ if ann.get('ignore', False):
+ continue
+ x1, y1, w, h = ann['bbox']
+ inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+ inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+ if inter_w * inter_h == 0:
+ continue
+ if ann['area'] <= 0 or w < 1 or h < 1:
+ continue
+ if ann['category_id'] not in self.cat_ids:
+ continue
+ bbox = [x1, y1, x1 + w, y1 + h]
+ if ann.get('iscrowd', False):
+ gt_bboxes_ignore.append(bbox)
+ else:
+ gt_bboxes.append(bbox)
+ gt_labels.append(self.cat2label[ann['category_id']])
+ attr_labels.append(ann['attribute_id'])
+ gt_masks_ann.append(ann.get('segmentation', None))
+ # 3D annotations in camera coordinates
+ bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1)
+ velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2)
+ nan_mask = np.isnan(velo_cam3d[:, 0])
+ velo_cam3d[nan_mask] = [0.0, 0.0]
+ bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1)
+ gt_bboxes_cam3d.append(bbox_cam3d.squeeze())
+ # 2.5D annotations in camera coordinates
+ center2d = ann['center2d'][:2]
+ depth = ann['center2d'][2]
+ centers2d.append(center2d)
+ depths.append(depth)
+
+ if gt_bboxes:
+ gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+ gt_labels = np.array(gt_labels, dtype=np.int64)
+ attr_labels = np.array(attr_labels, dtype=np.int64)
+ else:
+ gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+ gt_labels = np.array([], dtype=np.int64)
+ attr_labels = np.array([], dtype=np.int64)
+
+ if gt_bboxes_cam3d:
+ gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32)
+ centers2d = np.array(centers2d, dtype=np.float32)
+ depths = np.array(depths, dtype=np.float32)
+ else:
+ gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size),
+ dtype=np.float32)
+ centers2d = np.zeros((0, 2), dtype=np.float32)
+ depths = np.zeros((0), dtype=np.float32)
+
+ gt_bboxes_cam3d = CameraInstance3DBoxes(
+ gt_bboxes_cam3d,
+ box_dim=gt_bboxes_cam3d.shape[-1],
+ origin=(0.5, 0.5, 0.5))
+ gt_labels_3d = copy.deepcopy(gt_labels)
+
+ if gt_bboxes_ignore:
+ gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+ else:
+ gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+ seg_map = img_info['filename'].replace('jpg', 'png')
+
+ ann = dict(
+ bboxes=gt_bboxes,
+ labels=gt_labels,
+ gt_bboxes_3d=gt_bboxes_cam3d,
+ gt_labels_3d=gt_labels_3d,
+ attr_labels=attr_labels,
+ centers2d=centers2d,
+ depths=depths,
+ bboxes_ignore=gt_bboxes_ignore,
+ masks=gt_masks_ann,
+ seg_map=seg_map)
+
+ return ann
+
+ def get_attr_name(self, attr_idx, label_name):
+ """Get attribute from predicted index.
+ This is a workaround to predict attribute when the predicted velocity
+ is not reliable. We map the predicted attribute index to the one
+ in the attribute set. If it is consistent with the category, we will
+ keep it. Otherwise, we will use the default attribute.
+ Args:
+ attr_idx (int): Attribute index.
+ label_name (str): Predicted category name.
+ Returns:
+ str: Predicted attribute name.
+ """
+ # TODO: Simplify the variable name
+ AttrMapping_rev2 = [
+ 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
+ 'pedestrian.standing', 'pedestrian.sitting_lying_down',
+ 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
+ ]
+ if label_name == 'car' or label_name == 'bus' \
+ or label_name == 'truck' or label_name == 'trailer' \
+ or label_name == 'construction_vehicle':
+ if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \
+ AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \
+ AttrMapping_rev2[attr_idx] == 'vehicle.stopped':
+ return AttrMapping_rev2[attr_idx]
+ else:
+ return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+ elif label_name == 'pedestrian':
+ if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \
+ AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \
+ AttrMapping_rev2[attr_idx] == \
+ 'pedestrian.sitting_lying_down':
+ return AttrMapping_rev2[attr_idx]
+ else:
+ return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+ elif label_name == 'bicycle' or label_name == 'motorcycle':
+ if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \
+ AttrMapping_rev2[attr_idx] == 'cycle.without_rider':
+ return AttrMapping_rev2[attr_idx]
+ else:
+ return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+ else:
+ return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
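+ # Worked example (illustrative, not part of the original file): with
+ # AttrMapping_rev2 as defined above, get_attr_name(5, 'car') returns
+ # 'vehicle.moving' because the predicted attribute is consistent with a
+ # vehicle class, while get_attr_name(0, 'car') falls back to the default
+ # 'vehicle.parked' because 'cycle.with_rider' does not apply to cars.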
+
+ def _format_bbox(self, results, jsonfile_prefix=None):
+ """Convert the results to the standard format.
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+ Returns:
+ str: Path of the output json file.
+ """
+ nusc_annos = {}
+ mapped_class_names = self.CLASSES
+
+ print('Start to convert detection format...')
+
+ CAM_NUM = 6
+
+ for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
+
+ if sample_id % CAM_NUM == 0:
+ boxes_per_frame = []
+ attrs_per_frame = []
+
+ # need to merge results from images of the same sample
+ annos = []
+ boxes, attrs = output_to_nusc_box(det)
+ sample_token = self.data_infos[sample_id]['token']
+ boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id],
+ boxes, attrs,
+ mapped_class_names,
+ self.eval_detection_configs,
+ self.eval_version)
+
+ boxes_per_frame.extend(boxes)
+ attrs_per_frame.extend(attrs)
+ # Remove redundant predictions caused by overlap of images
+ if (sample_id + 1) % CAM_NUM != 0:
+ continue
+ boxes = global_nusc_box_to_cam(
+ self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
+ mapped_class_names, self.eval_detection_configs,
+ self.eval_version)
+ cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
+ # box nms 3d over 6 images in a frame
+ # TODO: move this global setting into config
+ nms_cfg = dict(
+ use_rotate_nms=True,
+ nms_across_levels=False,
+ nms_pre=4096,
+ nms_thr=0.05,
+ score_thr=0.01,
+ min_bbox_size=0,
+ max_per_frame=500)
+ from mmcv import Config
+ nms_cfg = Config(nms_cfg)
+ cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
+ boxes3d = cam_boxes3d.tensor
+ # generate attr scores from attr labels
+ attrs = labels.new_tensor([attr for attr in attrs_per_frame])
+ boxes3d, scores, labels, attrs = box3d_multiclass_nms(
+ boxes3d,
+ cam_boxes3d_for_nms,
+ scores,
+ nms_cfg.score_thr,
+ nms_cfg.max_per_frame,
+ nms_cfg,
+ mlvl_attr_scores=attrs)
+ cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
+ det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
+ boxes, attrs = output_to_nusc_box(det)
+ boxes, attrs = cam_nusc_box_to_global(
+ self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
+ mapped_class_names, self.eval_detection_configs,
+ self.eval_version)
+
+ for i, box in enumerate(boxes):
+ name = mapped_class_names[box.label]
+ attr = self.get_attr_name(attrs[i], name)
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box.center.tolist(),
+ size=box.wlh.tolist(),
+ rotation=box.orientation.elements.tolist(),
+ velocity=box.velocity[:2].tolist(),
+ detection_name=name,
+ detection_score=box.score,
+ attribute_name=attr)
+ annos.append(nusc_anno)
+ # other views results of the same frame should be concatenated
+ if sample_token in nusc_annos:
+ nusc_annos[sample_token].extend(annos)
+ else:
+ nusc_annos[sample_token] = annos
+
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ }
+
+ mmcv.mkdir_or_exist(jsonfile_prefix)
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+ print('Results written to', res_path)
+ mmcv.dump(nusc_submissions, res_path)
+ return res_path
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ result_name='img_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ metric (str): Metric name used for evaluation. Default: 'bbox'.
+ result_name (str): Result name in the metric prefix.
+ Default: 'img_bbox'.
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+ from nuscenes import NuScenes
+ #from nuscenes.eval.detection.evaluate import NuScenesEval
+ from .nuscnes_eval import NuScenesEval_custom
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+ self.nusc = NuScenes(
+ version=self.version, dataroot=self.data_root, verbose=False)
+ eval_set_map = {
+ 'v1.0-mini': 'mini_val',
+ 'v1.0-trainval': 'val',
+ }
+ # nusc_eval = NuScenesEval(
+ # nusc,
+ # config=self.eval_detection_configs,
+ # result_path=result_path,
+ # eval_set=eval_set_map[self.version],
+ # output_dir=output_dir,
+ # verbose=False)
+ self.nusc_eval = NuScenesEval_custom(
+ self.nusc,
+ config=self.eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=True,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos
+ )
+
+ self.nusc_eval.main(render_curves=True)
+
+ # record metrics
+ metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
+ detail = dict()
+ metric_prefix = f'{result_name}_NuScenes'
+ for name in self.CLASSES:
+ for k, v in metrics['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,
+ self.ErrNameMapping[k])] = val
+
+ detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+ return detail
+
+ def format_results(self, results, jsonfile_prefix=None, **kwargs):
+ """Format the results to json (standard format for COCO evaluation).
+ Args:
+ results (list[tuple | numpy.ndarray]): Testing results of the
+ dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ Returns:
+ tuple: (result_files, tmp_dir), result_files is a dict containing \
+ the json filepaths, tmp_dir is the temporal directory created \
+ for saving json files when jsonfile_prefix is not specified.
+ """
+ assert isinstance(results, list), 'results must be a list'
+ assert len(results) == len(self), (
+ 'The length of results is not equal to the dataset len: {} != {}'.
+ format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ # currently the output prediction results could be in two formats
+ # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
+ # 2. list of dict('pts_bbox' or 'img_bbox':
+ # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
+ # this is a workaround to enable evaluation of both formats on nuScenes
+ # refer to https://github.com/open-mmlab/mmdetection3d/issues/449
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+ # not evaluate 2D predictions on nuScenes
+ if '2d' in name:
+ continue
+ print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+
+ return result_files, tmp_dir
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['img_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ metric (str | list[str]): Metrics to be evaluated.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+ if isinstance(result_files, dict):
+ results_dict = dict()
+ for name in result_names:
+ print('Evaluating bboxes of {}'.format(name))
+ ret_dict = self._evaluate_single(result_files[name])
+ results_dict.update(ret_dict)
+ elif isinstance(result_files, str):
+ results_dict = self._evaluate_single(result_files)
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+ return results_dict
+
+ def _extract_data(self, index, pipeline, key, load_annos=False):
+ """Load data using input pipeline and extract data according to key.
+ Args:
+ index (int): Index for accessing the target data.
+ pipeline (:obj:`Compose`): Composed data loading pipeline.
+ key (str | list[str]): One single or a list of data key.
+ load_annos (bool): Whether to load data annotations.
+ If True, need to set self.test_mode as False before loading.
+ Returns:
+ np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]:
+ A single or a list of loaded data.
+ """
+ assert pipeline is not None, 'data loading pipeline is not provided'
+ img_info = self.data_infos[index]
+ input_dict = dict(img_info=img_info)
+
+ if load_annos:
+ ann_info = self.get_ann_info(index)
+ input_dict.update(dict(ann_info=ann_info))
+
+ self.pre_pipeline(input_dict)
+ example = pipeline(input_dict)
+
+ # extract data items according to keys
+ if isinstance(key, str):
+ data = extract_result_dict(example, key)
+ else:
+ data = [extract_result_dict(example, k) for k in key]
+
+ return data
+
+ def _get_pipeline(self, pipeline):
+ """Get data loading pipeline in self.show/evaluate function.
+ Args:
+ pipeline (list[dict] | None): Input pipeline. If None is given, \
+ get from self.pipeline.
+ """
+ if pipeline is None:
+ if not hasattr(self, 'pipeline') or self.pipeline is None:
+ warnings.warn(
+ 'Use default pipeline for data loading, this may cause '
+ 'errors when data is on ceph')
+ return self._build_default_pipeline()
+ loading_pipeline = get_loading_pipeline(self.pipeline.transforms)
+ return Compose(loading_pipeline)
+ return Compose(pipeline)
+
+ def _build_default_pipeline(self):
+ """Build the default pipeline for this dataset."""
+ pipeline = [
+ dict(type='LoadImageFromFileMono3D'),
+ dict(
+ type='DefaultFormatBundle3D',
+ class_names=self.CLASSES,
+ with_label=False),
+ dict(type='Collect3D', keys=['img'])
+ ]
+ return Compose(pipeline)
+
+ def show(self, results, out_dir, show=True, pipeline=None):
+ """Results visualization.
+ Args:
+ results (list[dict]): List of bounding boxes results.
+ out_dir (str): Output directory of visualization result.
+ show (bool): Visualize the results online.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+ """
+ assert out_dir is not None, 'Expect out_dir, got none.'
+ pipeline = self._get_pipeline(pipeline)
+ for i, result in enumerate(results):
+ if 'img_bbox' in result.keys():
+ result = result['img_bbox']
+ data_info = self.data_infos[i]
+ img_path = data_info['file_name']
+ file_name = osp.split(img_path)[-1].split('.')[0]
+ img, img_metas = self._extract_data(i, pipeline,
+ ['img', 'img_metas'])
+ # need to transpose channel to the last dim
+ img = img.numpy().transpose(1, 2, 0)
+ gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d']
+ pred_bboxes = result['boxes_3d']
+ show_multi_modality_result(
+ img,
+ gt_bboxes,
+ pred_bboxes,
+ img_metas['cam2img'],
+ out_dir,
+ file_name,
+ box_mode='camera',
+ show=show)
+
+
+def output_to_nusc_box(detection):
+ """Convert the output to the box class in the nuScenes.
+ Args:
+ detection (dict): Detection results.
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+ - labels_3d (torch.Tensor): Predicted box labels.
+ - attrs_3d (torch.Tensor, optional): Predicted attributes.
+ Returns:
+ list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+ """
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+ attrs = None
+ if 'attrs_3d' in detection:
+ attrs = detection['attrs_3d'].numpy()
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+
+ # convert the dim/rot to nuscbox convention
+ box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]]
+ box_yaw = -box_yaw
+
+ box_list = []
+ for i in range(len(box3d)):
+ q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
+ quat = q2 * q1
+ velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8])
+ box = NuScenesBox(
+ box_gravity_center[i],
+ box_dims[i],
+ quat,
+ label=labels[i],
+ score=scores[i],
+ velocity=velocity)
+ box_list.append(box)
+ return box_list, attrs
+
+
+def cam_nusc_box_to_global(info,
+ boxes,
+ attrs,
+ classes,
+ eval_configs,
+ eval_version='detection_cvpr_2019'):
+ """Convert the box from camera to global coordinate.
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ classes (list[str]): Mapped classes in the evaluation.
+ eval_configs (object): Evaluation configuration object.
+ eval_version (str): Evaluation version.
+ Default: 'detection_cvpr_2019'
+ Returns:
+ tuple: The kept NuScenesBoxes in the global coordinate and
+ their corresponding attributes.
+ """
+ box_list = []
+ attr_list = []
+ for (box, attr) in zip(boxes, attrs):
+ # Move box to ego vehicle coord system
+ box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']))
+ box.translate(np.array(info['cam2ego_translation']))
+ # filter det in ego.
+ cls_range_map = eval_configs.class_range
+ radius = np.linalg.norm(box.center[:2], 2)
+ det_range = cls_range_map[classes[box.label]]
+ if radius > det_range:
+ continue
+ # Move box to global coord system
+ box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
+ box.translate(np.array(info['ego2global_translation']))
+ box_list.append(box)
+ attr_list.append(attr)
+ return box_list, attr_list
+
+
+def global_nusc_box_to_cam(info,
+ boxes,
+ classes,
+ eval_configs,
+ eval_version='detection_cvpr_2019'):
+ """Convert the box from global to camera coordinate.
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ classes (list[str]): Mapped classes in the evaluation.
+ eval_configs (object): Evaluation configuration object.
+ eval_version (str): Evaluation version.
+ Default: 'detection_cvpr_2019'
+ Returns:
+ list: List of standard NuScenesBoxes in the camera
+ coordinate.
+ """
+ box_list = []
+ for box in boxes:
+ # Move box to ego vehicle coord system
+ box.translate(-np.array(info['ego2global_translation']))
+ box.rotate(
+ pyquaternion.Quaternion(info['ego2global_rotation']).inverse)
+ # filter det in ego.
+ cls_range_map = eval_configs.class_range
+ radius = np.linalg.norm(box.center[:2], 2)
+ det_range = cls_range_map[classes[box.label]]
+ if radius > det_range:
+ continue
+ # Move box to camera coord system
+ box.translate(-np.array(info['cam2ego_translation']))
+ box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse)
+ box_list.append(box)
+ return box_list
+
+
+def nusc_box_to_cam_box3d(boxes):
+ """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
+ Args:
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ Returns:
+ tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \
+ Converted 3D bounding boxes, scores and labels.
+ """
+ locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
+ dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
+ rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
+ for b in boxes]).view(-1, 1)
+ velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2)
+
+ # convert the NuScenesBox convention to the CameraInstance3DBoxes convention
+ dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
+ rots = -rots
+
+ boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
+ cam_boxes3d = CameraInstance3DBoxes(
+ boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
+ scores = torch.Tensor([b.score for b in boxes]).cuda()
+ labels = torch.LongTensor([b.label for b in boxes]).cuda()
+ nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
+ indices = labels.new_tensor(list(range(scores.shape[0])))
+ nms_scores[indices, labels] = scores
+ return cam_boxes3d, nms_scores, labels
\ No newline at end of file
diff --git a/mmcv/datasets/nuscenes_styled_eval_utils.py b/mmcv/datasets/nuscenes_styled_eval_utils.py
new file mode 100644
index 0000000..a8053aa
--- /dev/null
+++ b/mmcv/datasets/nuscenes_styled_eval_utils.py
@@ -0,0 +1,755 @@
+from collections import defaultdict
+from typing import List, Dict, Tuple, Union, Callable
+import abc
+import numpy as np
+from pyquaternion import Quaternion
+
+
+def center_distance(gt_box, pred_box) -> float:
+ """
+ L2 distance between the box centers (xy only).
+ :param gt_box: GT annotation sample.
+ :param pred_box: Predicted sample.
+ :return: L2 distance.
+ """
+ return np.linalg.norm(np.array(pred_box.translation[:2]) - np.array(gt_box.translation[:2]))
+
+
+def velocity_l2(gt_box, pred_box) -> float:
+ """
+ L2 distance between the velocity vectors (xy only).
+ If the predicted velocities are nan, we return inf, which is subsequently clipped to 1.
+ :param gt_box: GT annotation sample.
+ :param pred_box: Predicted sample.
+ :return: L2 distance.
+ """
+ return np.linalg.norm(np.array(pred_box.velocity) - np.array(gt_box.velocity))
+
+
+def yaw_diff(gt_box, eval_box, period: float = 2*np.pi) -> float:
+ """
+ Returns the yaw angle difference between the orientation of two boxes.
+ :param gt_box: Ground truth box.
+ :param eval_box: Predicted box.
+ :param period: Periodicity in radians for assessing angle difference.
+ :return: Yaw angle difference in radians in [0, pi].
+ """
+ yaw_gt = quaternion_yaw(Quaternion(gt_box.rotation))
+ yaw_est = quaternion_yaw(Quaternion(eval_box.rotation))
+
+ return abs(angle_diff(yaw_gt, yaw_est, period))
+
+
+def angle_diff(x: float, y: float, period: float) -> float:
+ """
+ Get the smallest angle difference between 2 angles: the angle from y to x.
+ :param x: To angle.
+ :param y: From angle.
+ :param period: Periodicity in radians for assessing angle difference.
+ :return: Signed smallest between-angle difference in range (-pi, pi).
+ """
+
+ # calculate angle difference, modulo to [0, 2*pi]
+ diff = (x - y + period / 2) % period - period / 2
+ if diff > np.pi:
+ diff = diff - (2 * np.pi) # shift (pi, 2*pi] to (-pi, 0]
+
+ return diff
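+
+# Worked example (illustrative): for yaws of +3.0 and -3.0 rad with a 2*pi
+# period, the naive difference is 6.0 rad, but
+#   angle_diff(3.0, -3.0, 2 * np.pi) ~= -(2 * np.pi - 6.0) ~= -0.283 rad,
+# i.e. the difference is wrapped back into the signed range around zero.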
+
+
+def attr_acc(gt_box, pred_box) -> float:
+ """
+ Computes the classification accuracy for the attribute of this class (if any).
+ If the GT class has no attributes or the annotation is missing attributes, we assign an accuracy of nan, which is
+ ignored later on.
+ :param gt_box: GT annotation sample.
+ :param pred_box: Predicted sample.
+ :return: Attribute classification accuracy (0 or 1) or nan if GT annotation does not have any attributes.
+ """
+ if gt_box.attribute_name == '':
+ # If the class does not have attributes or this particular sample is missing attributes, return nan, which is
+ # ignored later. Note that about 0.4% of the sample_annotations have no attributes, although they should.
+ acc = np.nan
+ else:
+ # Check that label is correct.
+ acc = float(gt_box.attribute_name == pred_box.attribute_name)
+ return acc
+
+
+def scale_iou(sample_annotation, sample_result) -> float:
+ """
+ This method compares predictions to the ground truth in terms of scale.
+ It is equivalent to intersection over union (IOU) between the two boxes in 3D,
+ if we assume that the boxes are aligned, i.e. translation and rotation are considered identical.
+ :param sample_annotation: GT annotation sample.
+ :param sample_result: Predicted sample.
+ :return: Scale IOU.
+ """
+ # Validate inputs.
+ sa_size = np.array(sample_annotation.size)
+ sr_size = np.array(sample_result.size)
+ assert all(sa_size > 0), 'Error: sample_annotation sizes must be >0.'
+ assert all(sr_size > 0), 'Error: sample_result sizes must be >0.'
+
+ # Compute IOU.
+ min_wlh = np.minimum(sa_size, sr_size)
+ volume_annotation = np.prod(sa_size)
+ volume_result = np.prod(sr_size)
+ intersection = np.prod(min_wlh) # type: float
+ union = volume_annotation + volume_result - intersection # type: float
+ iou = intersection / union
+
+ return iou
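+
+# Worked example (illustrative): for a GT box of size (w, l, h) = (2.0, 4.0, 1.5)
+# and a prediction of size (2.2, 3.6, 1.5), the aligned intersection is
+# 2.0 * 3.6 * 1.5 = 10.8, the union is 12.0 + 11.88 - 10.8 = 13.08, and hence
+# scale_iou ~= 10.8 / 13.08 ~= 0.826.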
+
+
+def quaternion_yaw(q: Quaternion) -> float:
+ """
+ Calculate the yaw angle from a quaternion.
+ Note that this only works for a quaternion that represents a box in lidar or global coordinate frame.
+ It does not work for a box in the camera frame.
+ :param q: Quaternion of interest.
+ :return: Yaw angle in radians.
+ """
+
+ # Project into xy plane.
+ v = np.dot(q.rotation_matrix, np.array([1, 0, 0]))
+
+ # Measure yaw using arctan.
+ yaw = np.arctan2(v[1], v[0])
+
+ return yaw
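+
+# Sanity check (illustrative): quaternion_yaw(Quaternion(axis=[0, 0, 1],
+# angle=np.pi / 2)) returns ~pi/2, while the identity quaternion yields 0.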
+
+
+
+def cummean(x: np.array) -> np.array:
+ """
+ Computes the cumulative mean up to each position in a NaN-sensitive way:
+ - If all values are NaN, return an array of ones.
+ - If some values are NaN, accumulate while discarding those entries.
+ """
+ if sum(np.isnan(x)) == len(x):
+ # All values in the array are NaN.
+ return np.ones(len(x)) # If all errors are NaN, set the error to 1 for all operating points.
+ else:
+ # Accumulate in a nan-aware manner.
+ sum_vals = np.nancumsum(x.astype(float)) # Cumulative sum ignoring nans.
+ count_vals = np.cumsum(~np.isnan(x)) # Number of non-nans up to each position.
+ return np.divide(sum_vals, count_vals, out=np.zeros_like(sum_vals), where=count_vals != 0)
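+
+# Worked example (illustrative): cummean(np.array([0.5, np.nan, 1.0])) returns
+# [0.5, 0.5, 0.75]; the NaN entry contributes neither to the running sum nor
+# to the count of valid values.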
+
+
+class DetectionMetricData(abc.ABC):
+ """ This class holds accumulated and interpolated data required to calculate the detection metrics. """
+
+ nelem = 101
+
+ def __init__(self,
+ recall: np.array,
+ precision: np.array,
+ confidence: np.array,
+ trans_err: np.array,
+ vel_err: np.array,
+ scale_err: np.array,
+ orient_err: np.array,
+ attr_err: np.array):
+
+ # Assert lengths.
+ assert len(recall) == self.nelem
+ assert len(precision) == self.nelem
+ assert len(confidence) == self.nelem
+ assert len(trans_err) == self.nelem
+ assert len(vel_err) == self.nelem
+ assert len(scale_err) == self.nelem
+ assert len(orient_err) == self.nelem
+ assert len(attr_err) == self.nelem
+
+ # Assert ordering.
+ assert all(confidence == sorted(confidence, reverse=True)) # Confidences should be descending.
+ assert all(recall == sorted(recall)) # Recalls should be ascending.
+
+ # Set attributes explicitly to help IDEs figure out what is going on.
+ self.recall = recall
+ self.precision = precision
+ self.confidence = confidence
+ self.trans_err = trans_err
+ self.vel_err = vel_err
+ self.scale_err = scale_err
+ self.orient_err = orient_err
+ self.attr_err = attr_err
+
+ def __eq__(self, other):
+ eq = True
+ for key in self.serialize().keys():
+ eq = eq and np.array_equal(getattr(self, key), getattr(other, key))
+ return eq
+
+ @property
+ def max_recall_ind(self):
+ """ Returns index of max recall achieved. """
+
+ # Last instance of confidence > 0 is index of max achieved recall.
+ non_zero = np.nonzero(self.confidence)[0]
+ if len(non_zero) == 0: # If there are no matches, all the confidence values will be zero.
+ max_recall_ind = 0
+ else:
+ max_recall_ind = non_zero[-1]
+
+ return max_recall_ind
+
+ @property
+ def max_recall(self):
+ """ Returns max recall achieved. """
+
+ return self.recall[self.max_recall_ind]
+
+ def serialize(self):
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'recall': self.recall.tolist(),
+ 'precision': self.precision.tolist(),
+ 'confidence': self.confidence.tolist(),
+ 'trans_err': self.trans_err.tolist(),
+ 'vel_err': self.vel_err.tolist(),
+ 'scale_err': self.scale_err.tolist(),
+ 'orient_err': self.orient_err.tolist(),
+ 'attr_err': self.attr_err.tolist(),
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(recall=np.array(content['recall']),
+ precision=np.array(content['precision']),
+ confidence=np.array(content['confidence']),
+ trans_err=np.array(content['trans_err']),
+ vel_err=np.array(content['vel_err']),
+ scale_err=np.array(content['scale_err']),
+ orient_err=np.array(content['orient_err']),
+ attr_err=np.array(content['attr_err']))
+
+ @classmethod
+ def no_predictions(cls):
+ """ Returns a md instance corresponding to having no predictions. """
+ return cls(recall=np.linspace(0, 1, cls.nelem),
+ precision=np.zeros(cls.nelem),
+ confidence=np.zeros(cls.nelem),
+ trans_err=np.ones(cls.nelem),
+ vel_err=np.ones(cls.nelem),
+ scale_err=np.ones(cls.nelem),
+ orient_err=np.ones(cls.nelem),
+ attr_err=np.ones(cls.nelem))
+
+ @classmethod
+ def random_md(cls):
+ """ Returns an md instance corresponding to a random results. """
+ return cls(recall=np.linspace(0, 1, cls.nelem),
+ precision=np.random.random(cls.nelem),
+ confidence=np.linspace(0, 1, cls.nelem)[::-1],
+ trans_err=np.random.random(cls.nelem),
+ vel_err=np.random.random(cls.nelem),
+ scale_err=np.random.random(cls.nelem),
+ orient_err=np.random.random(cls.nelem),
+ attr_err=np.random.random(cls.nelem))
+
+
+class DetectionMetricDataList:
+ """ This stores a set of MetricData in a dict indexed by (name, match-distance). """
+
+ def __init__(self):
+ self.md = {}
+
+ def __getitem__(self, key):
+ return self.md[key]
+
+ def __eq__(self, other):
+ eq = True
+ for key in self.md.keys():
+ eq = eq and self[key] == other[key]
+ return eq
+
+ def get_class_data(self, detection_name: str) -> List[Tuple[DetectionMetricData, float]]:
+ """ Get all the MetricData entries for a certain detection_name. """
+ return [(md, dist_th) for (name, dist_th), md in self.md.items() if name == detection_name]
+
+ def get_dist_data(self, dist_th: float) -> List[Tuple[DetectionMetricData, str]]:
+ """ Get all the MetricData entries for a certain match_distance. """
+ return [(md, detection_name) for (detection_name, dist), md in self.md.items() if dist == dist_th]
+
+ def set(self, detection_name: str, match_distance: float, data: DetectionMetricData):
+ """ Sets the MetricData entry for a certain detection_name and match_distance. """
+ self.md[(detection_name, match_distance)] = data
+
+ def serialize(self) -> dict:
+ return {key[0] + ':' + str(key[1]): value.serialize() for key, value in self.md.items()}
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ mdl = cls()
+ for key, md in content.items():
+ name, distance = key.split(':')
+ mdl.set(name, float(distance), DetectionMetricData.deserialize(md))
+ return mdl
+
+class DetectionMetrics:
+ """ Stores average precision and true positive metric results. Provides properties to summarize. """
+
+ def __init__(self, cfg: dict):
+
+ self.cfg = cfg
+ self._label_aps = defaultdict(lambda: defaultdict(float))
+ self._label_tp_errors = defaultdict(lambda: defaultdict(float))
+ self.eval_time = None
+
+ def add_label_ap(self, detection_name: str, dist_th: float, ap: float) -> None:
+ self._label_aps[detection_name][dist_th] = ap
+
+ def get_label_ap(self, detection_name: str, dist_th: float) -> float:
+ return self._label_aps[detection_name][dist_th]
+
+ def add_label_tp(self, detection_name: str, metric_name: str, tp: float):
+ self._label_tp_errors[detection_name][metric_name] = tp
+
+ def get_label_tp(self, detection_name: str, metric_name: str) -> float:
+ return self._label_tp_errors[detection_name][metric_name]
+
+ def add_runtime(self, eval_time: float) -> None:
+ self.eval_time = eval_time
+
+ @property
+ def mean_dist_aps(self) -> Dict[str, float]:
+ """ Calculates the mean over distance thresholds for each label. """
+ return {class_name: np.mean(list(d.values())) for class_name, d in self._label_aps.items()}
+
+ @property
+ def mean_ap(self) -> float:
+ """ Calculates the mean AP by averaging over distance thresholds and classes. """
+ return float(np.mean(list(self.mean_dist_aps.values())))
+
+ @property
+ def tp_errors(self) -> Dict[str, float]:
+ """ Calculates the mean true positive error across all classes for each metric. """
+ errors = {}
+ for metric_name in self.cfg['tp_metrics']:
+ class_errors = []
+ for detection_name in self.cfg['class_names']:
+ class_errors.append(self.get_label_tp(detection_name, metric_name))
+
+ errors[metric_name] = float(np.nanmean(class_errors))
+
+ return errors
+
+ @property
+ def tp_scores(self) -> Dict[str, float]:
+ scores = {}
+ tp_errors = self.tp_errors
+ for metric_name in self.cfg['tp_metrics']:
+
+ # We convert the true positive errors to "scores" by 1-error.
+ score = 1.0 - tp_errors[metric_name]
+
+ # Some of the true positive errors are unbounded, so we bound the scores to min 0.
+ score = max(0.0, score)
+
+ scores[metric_name] = score
+
+ return scores
+
+ @property
+ def nd_score(self) -> float:
+ """
+ Compute the nuScenes detection score (NDS, weighted sum of the individual scores).
+ :return: The NDS.
+ """
+ # Summarize.
+ total = float(self.cfg['mean_ap_weight'] * self.mean_ap + np.sum(list(self.tp_scores.values())))
+
+ # Normalize.
+ total = total / float(self.cfg['mean_ap_weight'] + len(self.tp_scores.keys()))
+
+ return total
+
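+ # Note on nd_score (hedged): with a standard 'detection_cvpr_2019'-style
+ # config (mean_ap_weight of 5 and five TP metrics, an assumption about the
+ # cfg dict passed in), the expression above reduces to the familiar
+ # NDS = (5 * mAP + sum of the five clipped TP scores) / 10.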
+
+ def serialize(self):
+ return {
+ 'label_aps': self._label_aps,
+ 'mean_dist_aps': self.mean_dist_aps,
+ 'mean_ap': self.mean_ap,
+ 'label_tp_errors': self._label_tp_errors,
+ 'tp_errors': self.tp_errors,
+ 'tp_scores': self.tp_scores,
+ 'nd_score': self.nd_score,
+ 'eval_time': self.eval_time,
+ 'cfg': self.cfg
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized dictionary. """
+
+ cfg = content['cfg']
+ metrics = cls(cfg=cfg)
+ metrics.add_runtime(content['eval_time'])
+
+ for detection_name, label_aps in content['label_aps'].items():
+ for dist_th, ap in label_aps.items():
+ metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap))
+
+ for detection_name, label_tps in content['label_tp_errors'].items():
+ for metric_name, tp in label_tps.items():
+ metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp))
+
+ return metrics
+
+ def __eq__(self, other):
+ eq = True
+ eq = eq and self._label_aps == other._label_aps
+ eq = eq and self._label_tp_errors == other._label_tp_errors
+ eq = eq and self.eval_time == other.eval_time
+ eq = eq and self.cfg == other.cfg
+
+ return eq
+
+
+class DetectionBox(abc.ABC):
+ """ Data class used during detection evaluation. Can be a prediction or ground truth."""
+
+ def __init__(self,
+ sample_token: str = "",
+ translation: Tuple[float, float, float] = (0, 0, 0),
+ size: Tuple[float, float, float] = (0, 0, 0),
+ rotation: Tuple[float, float, float, float] = (0, 0, 0, 0),
+ velocity: Tuple[float, float] = (0, 0),
+ ego_translation: Tuple[float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters.
+ num_pts: int = -1, # Number of LIDAR or RADAR points inside the box. Only for GT boxes.
+ detection_name: str = 'car', # The class name used in the detection challenge.
+ detection_score: float = -1.0, # GT samples do not have a score.
+ attribute_name: str = ''): # Box attribute. Each box can have at most 1 attribute.
+
+
+ assert detection_name is not None, 'Error: detection_name cannot be empty!'
+ # assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name
+
+ # assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \
+ # 'Error: Unknown attribute_name %s' % attribute_name
+
+ assert type(detection_score) == float, 'Error: detection_score must be a float!'
+ assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!'
+ self.sample_token = sample_token
+ self.translation = translation
+ self.size = size
+ self.rotation = rotation
+ self.velocity = velocity
+ self.ego_translation = ego_translation
+ self.num_pts = num_pts
+ self.detection_name = detection_name
+ self.detection_score = detection_score
+ self.attribute_name = attribute_name
+
+ def __eq__(self, other):
+ return (self.sample_token == other.sample_token and
+ self.translation == other.translation and
+ self.size == other.size and
+ self.rotation == other.rotation and
+ self.velocity == other.velocity and
+ self.ego_translation == other.ego_translation and
+ self.num_pts == other.num_pts and
+ self.detection_name == other.detection_name and
+ self.detection_score == other.detection_score and
+ self.attribute_name == other.attribute_name)
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'])
+
+ @property
+ def ego_dist(self) -> float:
+ """ Compute the distance from this box to the ego vehicle in 2D. """
+ return np.sqrt(np.sum(np.array(self.ego_translation[:2]) ** 2))
+
+
+
+
+
+class EvalBoxes:
+ """ Data class that groups EvalBox instances by sample. """
+
+ def __init__(self):
+ """
+ Initializes the EvalBoxes for GT or predictions.
+ """
+ self.boxes = defaultdict(list)
+
+ def __repr__(self):
+ return "EvalBoxes with {} boxes across {} samples".format(len(self.all), len(self.sample_tokens))
+
+ def __getitem__(self, item) -> List[DetectionBox]:
+ return self.boxes[item]
+
+ def __eq__(self, other):
+ if not set(self.sample_tokens) == set(other.sample_tokens):
+ return False
+ for token in self.sample_tokens:
+ if not len(self[token]) == len(other[token]):
+ return False
+ for box1, box2 in zip(self[token], other[token]):
+ if box1 != box2:
+ return False
+ return True
+
+ def __len__(self):
+ return len(self.boxes)
+
+ @property
+ def all(self) -> List[DetectionBox]:
+ """ Returns all EvalBoxes in a list. """
+ ab = []
+ for sample_token in self.sample_tokens:
+ ab.extend(self[sample_token])
+ return ab
+
+ @property
+ def sample_tokens(self) -> List[str]:
+ """ Returns a list of all keys. """
+ return list(self.boxes.keys())
+
+ def add_boxes(self, sample_token: str, boxes: List[DetectionBox]) -> None:
+ """ Adds a list of boxes. """
+ self.boxes[sample_token].extend(boxes)
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {key: [box.serialize() for box in boxes] for key, boxes in self.boxes.items()}
+
+ @classmethod
+ def deserialize(cls, content: dict, box_cls):
+ """
+ Initialize from serialized content.
+ :param content: A dictionary with the serialized content of the box.
+ :param box_cls: The class of the boxes, DetectionBox or TrackingBox.
+ """
+ eb = cls()
+ for sample_token, boxes in content.items():
+ eb.add_boxes(sample_token, [box_cls.deserialize(box) for box in boxes])
+ return eb
+
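+# Illustrative (hedged) usage of the container above, assuming `submission` is
+# an already-loaded result dict in the nuScenes submission format:
+#   preds = EvalBoxes.deserialize(submission['results'], DetectionBox)
+#   preds[sample_token]   # list of DetectionBox instances for one sample
+#   len(preds.all)        # total number of boxes across all samples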
+
+def accumulate(gt_boxes,
+ pred_boxes,
+ class_name: str,
+ dist_fcn: Callable,
+ dist_th: float,
+ verbose: bool = False) -> DetectionMetricData:
+ """
+ Average Precision over predefined different recall thresholds for a single distance threshold.
+ The recall/conf thresholds and other raw metrics will be used in secondary metrics.
+ :param gt_boxes: Maps every sample_token to a list of its sample_annotations.
+ :param pred_boxes: Maps every sample_token to a list of its sample_results.
+ :param class_name: Class to compute AP on.
+ :param dist_fcn: Distance function used to match detections and ground truths.
+ :param dist_th: Distance threshold for a match.
+ :param verbose: If true, print debug messages.
+ :return: A DetectionMetricData instance holding the raw precision/recall and TP-error data for this class and distance threshold.
+ """
+ # ---------------------------------------------
+ # Organize input and initialize accumulators.
+ # ---------------------------------------------
+
+ # Count the positives.
+ npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name])
+ if verbose:
+ print("Found {} GT of class {} out of {} total across {} samples.".
+ format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens)))
+
+ # For missing classes in the GT, return a data structure corresponding to no predictions.
+ if npos == 0:
+ return DetectionMetricData.no_predictions()
+
+ # Organize the predictions in a single list.
+ pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name]
+ pred_confs = [box.detection_score for box in pred_boxes_list]
+
+ if verbose:
+ print("Found {} PRED of class {} out of {} total across {} samples.".
+ format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens)))
+
+ # Sort by confidence.
+ sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1]
+
+ # Do the actual matching.
+ tp = [] # Accumulator of true positives.
+ fp = [] # Accumulator of false positives.
+ conf = [] # Accumulator of confidences.
+
+ # match_data holds the extra metrics we calculate for each match.
+ match_data = {'trans_err': [],
+ 'vel_err': [],
+ 'scale_err': [],
+ 'orient_err': [],
+ 'attr_err': [],
+ 'conf': []}
+
+ # ---------------------------------------------
+ # Match and accumulate match data.
+ # ---------------------------------------------
+
+ taken = set() # Initially no gt bounding box is matched.
+ for ind in sortind:
+ pred_box = pred_boxes_list[ind]
+ min_dist = np.inf
+ match_gt_idx = None
+
+ for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]):
+
+ # Find closest match among ground truth boxes
+ if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken:
+ this_distance = dist_fcn(gt_box, pred_box)
+ if this_distance < min_dist:
+ min_dist = this_distance
+ match_gt_idx = gt_idx
+
+ # If the closest match is close enough according to threshold we have a match!
+ is_match = min_dist < dist_th
+
+ if is_match:
+ taken.add((pred_box.sample_token, match_gt_idx))
+
+ # Update tp, fp and confs.
+ tp.append(1)
+ fp.append(0)
+ conf.append(pred_box.detection_score)
+
+ # Since it is a match, update match data also.
+ gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx]
+
+ match_data['trans_err'].append(center_distance(gt_box_match, pred_box))
+ match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box))
+ match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box))
+
+ # Barrier orientation is only determined up to 180 degrees. (For cones, orientation is discarded later.)
+ period = np.pi if class_name == 'barrier' else 2 * np.pi
+ match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period))
+
+ match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box))
+ match_data['conf'].append(pred_box.detection_score)
+
+ else:
+ # No match. Mark this as a false positive.
+ tp.append(0)
+ fp.append(1)
+ conf.append(pred_box.detection_score)
+
+ # Check if we have any matches. If not, just return a "no predictions" array.
+ if len(match_data['trans_err']) == 0:
+ return DetectionMetricData.no_predictions()
+
+ # ---------------------------------------------
+ # Calculate and interpolate precision and recall
+ # ---------------------------------------------
+
+ # Accumulate.
+ tp = np.cumsum(tp).astype(float)
+ fp = np.cumsum(fp).astype(float)
+ conf = np.array(conf)
+
+ # Calculate precision and recall.
+ prec = tp / (fp + tp)
+ rec = tp / float(npos)
+
+ rec_interp = np.linspace(0, 1, DetectionMetricData.nelem) # 101 steps, from 0% to 100% recall.
+ prec = np.interp(rec_interp, rec, prec, right=0)
+ conf = np.interp(rec_interp, rec, conf, right=0)
+ rec = rec_interp
+
+ # ---------------------------------------------
+ # Re-sample the match-data to match, prec, recall and conf.
+ # ---------------------------------------------
+
+ for key in match_data.keys():
+ if key == "conf":
+ continue # Confidence is used as reference to align with fp and tp. So skip in this step.
+
+ else:
+ # For each match_data, we first calculate the accumulated mean.
+ tmp = cummean(np.array(match_data[key]))
+
+ # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays)
+ match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1]
+
+ # ---------------------------------------------
+ # Done. Instantiate MetricData and return
+ # ---------------------------------------------
+ return DetectionMetricData(recall=rec,
+ precision=prec,
+ confidence=conf,
+ trans_err=match_data['trans_err'],
+ vel_err=match_data['vel_err'],
+ scale_err=match_data['scale_err'],
+ orient_err=match_data['orient_err'],
+ attr_err=match_data['attr_err'])
+
+
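+# Illustrative (hedged) usage: an evaluator typically calls this once per class
+# and matching threshold, e.g.
+#   md = accumulate(gt_boxes, pred_boxes, 'car', center_distance, dist_th=2.0)
+# where gt_boxes and pred_boxes are EvalBoxes instances and 2.0 m is one of the
+# standard nuScenes thresholds (0.5, 1.0, 2.0 and 4.0 m).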
+
+def calc_ap(md: DetectionMetricData, min_recall: float, min_precision: float) -> float:
+ """ Calculated average precision. """
+
+ assert 0 <= min_precision < 1
+ assert 0 <= min_recall <= 1
+
+ prec = np.copy(md.precision)
+ prec = prec[round(100 * min_recall) + 1:] # Clip low recalls. +1 to exclude the min recall bin.
+ prec -= min_precision # Clip low precision
+ prec[prec < 0] = 0
+ return float(np.mean(prec)) / (1.0 - min_precision)
+
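+# Worked example (illustrative): with min_recall = 0.1 and min_precision = 0.1,
+# the first 11 recall bins are discarded; if the remaining interpolated
+# precision were a constant 0.6, calc_ap would return (0.6 - 0.1) / 0.9 ~= 0.556.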
+
+def calc_tp(md: DetectionMetricData, min_recall: float, metric_name: str) -> float:
+ """ Calculates true positive errors. """
+
+ first_ind = round(100 * min_recall) + 1 # +1 to exclude the error at min recall.
+ last_ind = md.max_recall_ind # First instance of confidence = 0 is index of max achieved recall.
+ if last_ind < first_ind:
+ return 1.0 # Assign 1 here. If this happens for all classes, the score for that TP metric will be 0.
+ else:
+ return float(np.mean(getattr(md, metric_name)[first_ind: last_ind + 1])) # +1 to include error at max recall.
+
+
+def quaternion_yaw(q: Quaternion) -> float:
+ """
+ Calculate the yaw angle from a quaternion.
+ Note that this only works for a quaternion that represents a box in lidar or global coordinate frame.
+ It does not work for a box in the camera frame.
+ :param q: Quaternion of interest.
+ :return: Yaw angle in radians.
+ """
+
+ # Project into xy plane.
+ v = np.dot(q.rotation_matrix, np.array([1, 0, 0]))
+
+ # Measure yaw using arctan.
+ yaw = np.arctan2(v[1], v[0])
+
+ return yaw
\ No newline at end of file
diff --git a/mmcv/datasets/nuscenes_vad_dataset.py b/mmcv/datasets/nuscenes_vad_dataset.py
new file mode 100644
index 0000000..a552afb
--- /dev/null
+++ b/mmcv/datasets/nuscenes_vad_dataset.py
@@ -0,0 +1,1933 @@
+import os
+import json
+import copy
+import tempfile
+from typing import Dict, List
+from mmcv.fileio.io import dump, load
+import numpy as np
+from .builder import DATASETS
+from mmcv.datasets import NuScenesDataset
+import pyquaternion
+import mmcv
+from os import path as osp
+import torch
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .vad_custom_nuscenes_eval import NuScenesEval_custom
+from nuscenes.eval.common.utils import center_distance
+from mmcv.utils.visual import save_tensor
+from mmcv.parallel import DataContainer as DC
+import random
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from nuscenes.utils.data_classes import Box as NuScenesBox
+from mmcv.core.bbox.structures.nuscenes_box import CustomNuscenesBox
+from shapely import affinity, ops
+from shapely.geometry import LineString, box, MultiPolygon, MultiLineString
+from mmcv.datasets.pipelines import to_tensor
+from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer
+from nuscenes.eval.detection.constants import DETECTION_NAMES
+
+
+class LiDARInstanceLines(object):
+ """Line instance in LIDAR coordinates
+
+ """
+ def __init__(self,
+ instance_line_list,
+ sample_dist=1,
+ num_samples=250,
+ padding=False,
+ fixed_num=-1,
+ padding_value=-10000,
+ patch_size=None):
+ assert isinstance(instance_line_list, list)
+ assert patch_size is not None
+ if len(instance_line_list) != 0:
+ assert isinstance(instance_line_list[0], LineString)
+ self.patch_size = patch_size
+ self.max_x = self.patch_size[1] / 2
+ self.max_y = self.patch_size[0] / 2
+ self.sample_dist = sample_dist
+ self.num_samples = num_samples
+ self.padding = padding
+ self.fixed_num = fixed_num
+ self.padding_value = padding_value
+
+ self.instance_list = instance_line_list
+
+ @property
+ def start_end_points(self):
+ """
+ return torch.Tensor([N,4]), in xstart, ystart, xend, yend form
+ """
+ assert len(self.instance_list) != 0
+ instance_se_points_list = []
+ for instance in self.instance_list:
+ se_points = []
+ se_points.extend(instance.coords[0])
+ se_points.extend(instance.coords[-1])
+ instance_se_points_list.append(se_points)
+ instance_se_points_array = np.array(instance_se_points_list)
+ instance_se_points_tensor = to_tensor(instance_se_points_array)
+ instance_se_points_tensor = instance_se_points_tensor.to(
+ dtype=torch.float32)
+ instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x)
+ instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y)
+ instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x)
+ instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y)
+ return instance_se_points_tensor
+
+ @property
+ def bbox(self):
+ """
+ return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form
+ """
+ assert len(self.instance_list) != 0
+ instance_bbox_list = []
+ for instance in self.instance_list:
+ # bounds is bbox: [xmin, ymin, xmax, ymax]
+ instance_bbox_list.append(instance.bounds)
+ instance_bbox_array = np.array(instance_bbox_list)
+ instance_bbox_tensor = to_tensor(instance_bbox_array)
+ instance_bbox_tensor = instance_bbox_tensor.to(
+ dtype=torch.float32)
+ instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x)
+ instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y)
+ instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x)
+ instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y)
+ return instance_bbox_tensor
+
+ @property
+ def fixed_num_sampled_points(self):
+ """
+ return torch.Tensor([N,fixed_num,2]): fixed_num (x, y) points sampled evenly along each instance
+ N means the num of instances
+ """
+ assert len(self.instance_list) != 0
+ instance_points_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ instance_points_list.append(sampled_points)
+ instance_points_array = np.array(instance_points_list)
+ instance_points_tensor = to_tensor(instance_points_array)
+ instance_points_tensor = instance_points_tensor.to(
+ dtype=torch.float32)
+ instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ return instance_points_tensor
+
+ @property
+ def fixed_num_sampled_points_ambiguity(self):
+ """
+ return torch.Tensor([N,1,fixed_num,2]): fixed_num (x, y) points per instance, with an extra ambiguity dim
+ N means the num of instances
+ """
+ assert len(self.instance_list) != 0
+ instance_points_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ instance_points_list.append(sampled_points)
+ instance_points_array = np.array(instance_points_list)
+ instance_points_tensor = to_tensor(instance_points_array)
+ instance_points_tensor = instance_points_tensor.to(
+ dtype=torch.float32)
+ instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ instance_points_tensor = instance_points_tensor.unsqueeze(1)
+ return instance_points_tensor
+
+ @property
+ def fixed_num_sampled_points_torch(self):
+ """
+ return torch.Tensor([N,fixed_num,2]): fixed_num (x, y) points resampled with torch linear interpolation
+ N means the num of instances
+ """
+ assert len(self.instance_list) != 0
+ instance_points_list = []
+ for instance in self.instance_list:
+ # distances = np.linspace(0, instance.length, self.fixed_num)
+ # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ poly_pts = to_tensor(np.array(list(instance.coords)))
+ poly_pts = poly_pts.unsqueeze(0).permute(0,2,1)
+ sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True)
+ sampled_pts = sampled_pts.permute(0,2,1).squeeze(0)
+ instance_points_list.append(sampled_pts)
+ # instance_points_array = np.array(instance_points_list)
+ # instance_points_tensor = to_tensor(instance_points_array)
+ instance_points_tensor = torch.stack(instance_points_list,dim=0)
+ instance_points_tensor = instance_points_tensor.to(
+ dtype=torch.float32)
+ instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ return instance_points_tensor
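+
+    # Hedged illustration (not in the original VAD/MapTR code): the torch-based
+    # linear resampling used by fixed_num_sampled_points_torch, exercised on a
+    # toy 3-point polyline. Helper name and values are hypothetical.
+    @staticmethod
+    def _demo_torch_resample(num_pts=5):
+        poly = torch.tensor([[0., 0.], [1., 0.], [1., 1.]])            # [3, 2] toy polyline
+        poly = poly.unsqueeze(0).permute(0, 2, 1)                      # [1, 2, 3] layout for interpolate
+        pts = torch.nn.functional.interpolate(
+            poly, size=num_pts, mode='linear', align_corners=True)     # [1, 2, num_pts]
+        return pts.permute(0, 2, 1).squeeze(0)                         # [num_pts, 2]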
+
+ @property
+ def shift_fixed_num_sampled_points(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ fixed_num = fixed_num_pts.shape[0]
+ shift_pts_list = []
+ if is_poly:
+ # import pdb;pdb.set_trace()
+ for shift_right_i in range(fixed_num):
+ shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
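+
+    # Hedged sketch (illustration only): for a closed polyline, every cyclic shift
+    # of its vertices describes the same polygon, which is exactly the set of
+    # equivalent targets enumerated by the shift_* properties. Toy values below.
+    @staticmethod
+    def _demo_cyclic_shifts():
+        square = torch.tensor([[0., 0.], [1., 0.], [1., 1.], [0., 1.]])   # vertices of a unit square
+        shifts = torch.stack([square.roll(i, 0) for i in range(square.shape[0])], dim=0)
+        return shifts                                                     # [4, 4, 2] equivalent orderings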
+
+ @property
+ def shift_fixed_num_sampled_points_v1(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ pts_num = fixed_num_pts.shape[0]
+ shift_num = pts_num - 1
+ if is_poly:
+ pts_to_shift = fixed_num_pts[:-1,:]
+ shift_pts_list = []
+ if is_poly:
+ for shift_right_i in range(shift_num):
+ shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ if is_poly:
+ _, _, num_coords = shift_pts.shape
+ tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords))
+ tmp_shift_pts[:,:-1,:] = shift_pts
+ tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+ shift_pts = tmp_shift_pts
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v2(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ assert len(self.instance_list) != 0
+ instances_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ poly_pts = np.array(list(instance.coords))
+ start_pts = poly_pts[0]
+ end_pts = poly_pts[-1]
+ is_poly = np.equal(start_pts, end_pts)
+ is_poly = is_poly.all()
+ shift_pts_list = []
+ pts_num, coords_num = poly_pts.shape
+ shift_num = pts_num - 1
+ final_shift_num = self.fixed_num - 1
+ if is_poly:
+ pts_to_shift = poly_pts[:-1,:]
+ for shift_right_i in range(shift_num):
+ shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+ pts_to_concat = shift_pts[0]
+ pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+ shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+ shift_instance = LineString(shift_pts)
+ shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ shift_pts_list.append(shift_sampled_points)
+ # import pdb;pdb.set_trace()
+ else:
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ flip_sampled_points = np.flip(sampled_points, axis=0)
+ shift_pts_list.append(sampled_points)
+ shift_pts_list.append(flip_sampled_points)
+
+ multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+ shifts_num,_,_ = multi_shifts_pts.shape
+
+ if shifts_num > final_shift_num:
+ index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False)
+ multi_shifts_pts = multi_shifts_pts[index]
+
+ multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+ multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+ dtype=torch.float32)
+
+ multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ # if not is_poly:
+ if multi_shifts_pts_tensor.shape[0] < final_shift_num:
+ padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+ multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+ instances_list.append(multi_shifts_pts_tensor)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v3(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ assert len(self.instance_list) != 0
+ instances_list = []
+ for instance in self.instance_list:
+ distances = np.linspace(0, instance.length, self.fixed_num)
+ poly_pts = np.array(list(instance.coords))
+ start_pts = poly_pts[0]
+ end_pts = poly_pts[-1]
+ is_poly = np.equal(start_pts, end_pts)
+ is_poly = is_poly.all()
+ shift_pts_list = []
+ pts_num, coords_num = poly_pts.shape
+ shift_num = pts_num - 1
+ final_shift_num = self.fixed_num - 1
+ if is_poly:
+ pts_to_shift = poly_pts[:-1,:]
+ for shift_right_i in range(shift_num):
+ shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+ pts_to_concat = shift_pts[0]
+ pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+ shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+ shift_instance = LineString(shift_pts)
+ shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ shift_pts_list.append(shift_sampled_points)
+ flip_pts_to_shift = np.flip(pts_to_shift, axis=0)
+ for shift_right_i in range(shift_num):
+ shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0)
+ pts_to_concat = shift_pts[0]
+ pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+ shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+ shift_instance = LineString(shift_pts)
+ shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ shift_pts_list.append(shift_sampled_points)
+ # import pdb;pdb.set_trace()
+ else:
+ sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ flip_sampled_points = np.flip(sampled_points, axis=0)
+ shift_pts_list.append(sampled_points)
+ shift_pts_list.append(flip_sampled_points)
+
+ multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+ shifts_num,_,_ = multi_shifts_pts.shape
+ # import pdb;pdb.set_trace()
+ if shifts_num > 2*final_shift_num:
+ index = np.random.choice(shift_num, final_shift_num, replace=False)
+ flip0_shifts_pts = multi_shifts_pts[index]
+ flip1_shifts_pts = multi_shifts_pts[index+shift_num]
+ multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0)
+
+ multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+ multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+ dtype=torch.float32)
+
+ multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+ multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+ # if not is_poly:
+ if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num:
+ padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+ multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+ instances_list.append(multi_shifts_pts_tensor)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_v4(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ pts_num = fixed_num_pts.shape[0]
+ shift_num = pts_num - 1
+ shift_pts_list = []
+ if is_poly:
+ pts_to_shift = fixed_num_pts[:-1,:]
+ for shift_right_i in range(shift_num):
+ shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+ flip_pts_to_shift = pts_to_shift.flip(0)
+ for shift_right_i in range(shift_num):
+ shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ if is_poly:
+ _, _, num_coords = shift_pts.shape
+ tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords))
+ tmp_shift_pts[:,:-1,:] = shift_pts
+ tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+ shift_pts = tmp_shift_pts
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ @property
+ def shift_fixed_num_sampled_points_torch(self):
+ """
+ return [instances_num, num_shifts, fixed_num, 2]
+ """
+ fixed_num_sampled_points = self.fixed_num_sampled_points_torch
+ instances_list = []
+ is_poly = False
+ # is_line = False
+ # import pdb;pdb.set_trace()
+ for fixed_num_pts in fixed_num_sampled_points:
+ # [fixed_num, 2]
+ is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+ fixed_num = fixed_num_pts.shape[0]
+ shift_pts_list = []
+ if is_poly:
+ # import pdb;pdb.set_trace()
+ for shift_right_i in range(fixed_num):
+ shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+ else:
+ shift_pts_list.append(fixed_num_pts)
+ shift_pts_list.append(fixed_num_pts.flip(0))
+ shift_pts = torch.stack(shift_pts_list,dim=0)
+
+ shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+ shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+ if not is_poly:
+ padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value)
+ shift_pts = torch.cat([shift_pts,padding],dim=0)
+ # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ instances_list.append(shift_pts)
+ instances_tensor = torch.stack(instances_list, dim=0)
+ instances_tensor = instances_tensor.to(
+ dtype=torch.float32)
+ return instances_tensor
+
+ # @property
+ # def polyline_points(self):
+ # """
+ # return [[x0,y0],[x1,y1],...]
+ # """
+ # assert len(self.instance_list) != 0
+ # for instance in self.instance_list:
+
+
+class VectorizedLocalMap(object):
+ CLASS2LABEL = {
+ 'road_divider': 0,
+ 'lane_divider': 0,
+ 'ped_crossing': 1,
+ 'contours': 2,
+ 'others': -1
+ }
+ def __init__(self,
+ dataroot,
+ patch_size,
+ map_classes=['divider','ped_crossing','boundary'],
+ line_classes=['road_divider', 'lane_divider'],
+ ped_crossing_classes=['ped_crossing'],
+ contour_classes=['road_segment', 'lane'],
+ sample_dist=1,
+ num_samples=250,
+ padding=False,
+ fixed_ptsnum_per_line=-1,
+ padding_value=-10000,):
+ '''
+ Args:
+ fixed_ptsnum_per_line (int): -1 means no fixed number of points per line
+ '''
+ super().__init__()
+ self.data_root = dataroot
+ self.MAPS = ['boston-seaport', 'singapore-hollandvillage',
+ 'singapore-onenorth', 'singapore-queenstown']
+ self.vec_classes = map_classes
+ self.line_classes = line_classes
+ self.ped_crossing_classes = ped_crossing_classes
+ self.polygon_classes = contour_classes
+ self.nusc_maps = {}
+ self.map_explorer = {}
+ for loc in self.MAPS:
+ self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc)
+ self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc])
+
+ self.patch_size = patch_size
+ self.sample_dist = sample_dist
+ self.num_samples = num_samples
+ self.padding = padding
+ self.fixed_num = fixed_ptsnum_per_line
+ self.padding_value = padding_value
+
+ def gen_vectorized_samples(self, location, lidar2global_translation, lidar2global_rotation):
+ '''
+ Use the lidar-to-global transform to extract the GT map layers for the current patch.
+ '''
+
+ map_pose = lidar2global_translation[:2]
+ rotation = Quaternion(lidar2global_rotation)
+
+ patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1])
+ patch_angle = quaternion_yaw(rotation) / np.pi * 180
+ # import pdb;pdb.set_trace()
+ vectors = []
+ for vec_class in self.vec_classes:
+ if vec_class == 'divider':
+ line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location)
+ line_instances_dict = self.line_geoms_to_instances(line_geom)
+ for line_type, instances in line_instances_dict.items():
+ for instance in instances:
+ vectors.append((instance, self.CLASS2LABEL.get(line_type, -1)))
+ elif vec_class == 'ped_crossing':
+ ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location)
+ # ped_vector_list = self.ped_geoms_to_vectors(ped_geom)
+ ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom)
+ # import pdb;pdb.set_trace()
+ for instance in ped_instance_list:
+ vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1)))
+ elif vec_class == 'boundary':
+ polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location)
+ # import pdb;pdb.set_trace()
+ poly_bound_list = self.poly_geoms_to_instances(polygon_geom)
+ # import pdb;pdb.set_trace()
+ for contour in poly_bound_list:
+ vectors.append((contour, self.CLASS2LABEL.get('contours', -1)))
+ else:
+ raise ValueError(f'WRONG vec_class: {vec_class}')
+
+ # filter out -1
+ filtered_vectors = []
+ gt_pts_loc_3d = []
+ gt_pts_num_3d = []
+ gt_labels = []
+ gt_instance = []
+ for instance, instance_type in vectors:
+ if instance_type != -1:
+ gt_instance.append(instance)
+ gt_labels.append(instance_type)
+
+ gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist,
+ self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size)
+
+ anns_results = dict(
+ gt_vecs_pts_loc=gt_instance,
+ gt_vecs_label=gt_labels,
+
+ )
+ # import pdb;pdb.set_trace()
+ return anns_results
+
+ def get_map_geom(self, patch_box, patch_angle, layer_names, location):
+ map_geom = []
+ for layer_name in layer_names:
+ if layer_name in self.line_classes:
+ # import pdb;pdb.set_trace()
+ geoms = self.get_divider_line(patch_box, patch_angle, layer_name, location)
+ # import pdb;pdb.set_trace()
+ # geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name)
+ map_geom.append((layer_name, geoms))
+ elif layer_name in self.polygon_classes:
+ geoms = self.get_contour_line(patch_box, patch_angle, layer_name, location)
+ # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name)
+ map_geom.append((layer_name, geoms))
+ elif layer_name in self.ped_crossing_classes:
+ geoms = self.get_ped_crossing_line(patch_box, patch_angle, location)
+ # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name)
+ map_geom.append((layer_name, geoms))
+ return map_geom
+
+ def _one_type_line_geom_to_vectors(self, line_geom):
+ line_vectors = []
+
+ for line in line_geom:
+ if not line.is_empty:
+ if line.geom_type == 'MultiLineString':
+ for single_line in line.geoms:
+ line_vectors.append(self.sample_pts_from_line(single_line))
+ elif line.geom_type == 'LineString':
+ line_vectors.append(self.sample_pts_from_line(line))
+ else:
+ raise NotImplementedError
+ return line_vectors
+
+ def _one_type_line_geom_to_instances(self, line_geom):
+ line_instances = []
+
+ for line in line_geom:
+ if not line.is_empty:
+ if line.geom_type == 'MultiLineString':
+ for single_line in line.geoms:
+ line_instances.append(single_line)
+ elif line.geom_type == 'LineString':
+ line_instances.append(line)
+ else:
+ raise NotImplementedError
+ return line_instances
+
+ def poly_geoms_to_vectors(self, polygon_geom):
+ roads = polygon_geom[0][1]
+ lanes = polygon_geom[1][1]
+ union_roads = ops.unary_union(roads)
+ union_lanes = ops.unary_union(lanes)
+ union_segments = ops.unary_union([union_roads, union_lanes])
+ max_x = self.patch_size[1] / 2
+ max_y = self.patch_size[0] / 2
+ local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+ exteriors = []
+ interiors = []
+ if union_segments.geom_type != 'MultiPolygon':
+ union_segments = MultiPolygon([union_segments])
+ for poly in union_segments.geoms:
+ exteriors.append(poly.exterior)
+ for inter in poly.interiors:
+ interiors.append(inter)
+
+ results = []
+ for ext in exteriors:
+ if ext.is_ccw:
+ ext.coords = list(ext.coords)[::-1]
+ lines = ext.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ for inter in interiors:
+ if not inter.is_ccw:
+ inter.coords = list(inter.coords)[::-1]
+ lines = inter.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ return self._one_type_line_geom_to_vectors(results)
+
+ def ped_poly_geoms_to_instances(self, ped_geom):
+ # import pdb;pdb.set_trace()
+ ped = ped_geom[0][1]
+ union_segments = ops.unary_union(ped)
+ max_x = self.patch_size[1] / 2
+ max_y = self.patch_size[0] / 2
+ # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+ local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2)
+ exteriors = []
+ interiors = []
+ if union_segments.geom_type != 'MultiPolygon':
+ union_segments = MultiPolygon([union_segments])
+ for poly in union_segments.geoms:
+ exteriors.append(poly.exterior)
+ for inter in poly.interiors:
+ interiors.append(inter)
+
+ results = []
+ for ext in exteriors:
+ if ext.is_ccw:
+ ext.coords = list(ext.coords)[::-1]
+ lines = ext.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ for inter in interiors:
+ if not inter.is_ccw:
+ inter.coords = list(inter.coords)[::-1]
+ lines = inter.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ return self._one_type_line_geom_to_instances(results)
+
+
+ def poly_geoms_to_instances(self, polygon_geom):
+ roads = polygon_geom[0][1]
+ lanes = polygon_geom[1][1]
+ union_roads = ops.unary_union(roads)
+ union_lanes = ops.unary_union(lanes)
+ union_segments = ops.unary_union([union_roads, union_lanes])
+ max_x = self.patch_size[1] / 2
+ max_y = self.patch_size[0] / 2
+ local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+ exteriors = []
+ interiors = []
+ if union_segments.geom_type != 'MultiPolygon':
+ union_segments = MultiPolygon([union_segments])
+ for poly in union_segments.geoms:
+ exteriors.append(poly.exterior)
+ for inter in poly.interiors:
+ interiors.append(inter)
+
+ results = []
+ for ext in exteriors:
+ if ext.is_ccw:
+ ext.coords = list(ext.coords)[::-1]
+ lines = ext.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ for inter in interiors:
+ if not inter.is_ccw:
+ inter.coords = list(inter.coords)[::-1]
+ lines = inter.intersection(local_patch)
+ if isinstance(lines, MultiLineString):
+ lines = ops.linemerge(lines)
+ results.append(lines)
+
+ return self._one_type_line_geom_to_instances(results)
+
+ def line_geoms_to_vectors(self, line_geom):
+ line_vectors_dict = dict()
+ for line_type, a_type_of_lines in line_geom:
+ one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines)
+ line_vectors_dict[line_type] = one_type_vectors
+
+ return line_vectors_dict
+ def line_geoms_to_instances(self, line_geom):
+ line_instances_dict = dict()
+ for line_type, a_type_of_lines in line_geom:
+ one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines)
+ line_instances_dict[line_type] = one_type_instances
+
+ return line_instances_dict
+
+ def ped_geoms_to_vectors(self, ped_geom):
+ ped_geom = ped_geom[0][1]
+ union_ped = ops.unary_union(ped_geom)
+ if union_ped.geom_type != 'MultiPolygon':
+ union_ped = MultiPolygon([union_ped])
+
+ max_x = self.patch_size[1] / 2
+ max_y = self.patch_size[0] / 2
+ local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+ results = []
+ for ped_poly in union_ped.geoms:
+ # rect = ped_poly.minimum_rotated_rectangle
+ ext = ped_poly.exterior
+ if not ext.is_ccw:
+ ext.coords = list(ext.coords)[::-1]
+ lines = ext.intersection(local_patch)
+ results.append(lines)
+
+ return self._one_type_line_geom_to_vectors(results)
+
+ def get_contour_line(self,patch_box,patch_angle,layer_name,location):
+ if layer_name not in self.map_explorer[location].map_api.non_geometric_polygon_layers:
+ raise ValueError('{} is not a polygonal layer'.format(layer_name))
+
+ patch_x = patch_box[0]
+ patch_y = patch_box[1]
+
+ patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle)
+
+ records = getattr(self.map_explorer[location].map_api, layer_name)
+
+ polygon_list = []
+ if layer_name == 'drivable_area':
+ for record in records:
+ polygons = [self.map_explorer[location].map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']]
+
+ for polygon in polygons:
+ new_polygon = polygon.intersection(patch)
+ if not new_polygon.is_empty:
+ new_polygon = affinity.rotate(new_polygon, -patch_angle,
+ origin=(patch_x, patch_y), use_radians=False)
+ new_polygon = affinity.affine_transform(new_polygon,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ if new_polygon.geom_type == 'Polygon':
+ new_polygon = MultiPolygon([new_polygon])
+ polygon_list.append(new_polygon)
+
+ else:
+ for record in records:
+ polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token'])
+
+ if polygon.is_valid:
+ new_polygon = polygon.intersection(patch)
+ if not new_polygon.is_empty:
+ new_polygon = affinity.rotate(new_polygon, -patch_angle,
+ origin=(patch_x, patch_y), use_radians=False)
+ new_polygon = affinity.affine_transform(new_polygon,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ if new_polygon.geom_type == 'Polygon':
+ new_polygon = MultiPolygon([new_polygon])
+ polygon_list.append(new_polygon)
+
+ return polygon_list
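+
+    # Hedged sketch (not part of the original code): the patch-frame transform used
+    # above first rotates a geometry by -patch_angle around the patch center, then
+    # translates it so the patch center becomes the origin. The helper below applies
+    # the same two shapely calls to a toy LineString; names and values are hypothetical.
+    @staticmethod
+    def _demo_patch_frame_transform(patch_x=10.0, patch_y=5.0, patch_angle=90.0):
+        geom = LineString([(10.0, 5.0), (12.0, 5.0)])    # toy geometry in global coordinates
+        geom = affinity.rotate(geom, -patch_angle, origin=(patch_x, patch_y), use_radians=False)
+        geom = affinity.affine_transform(geom, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+        return geom                                       # geometry expressed in the patch frame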
+
+ def get_divider_line(self,patch_box,patch_angle,layer_name,location):
+ if layer_name not in self.map_explorer[location].map_api.non_geometric_line_layers:
+ raise ValueError("{} is not a line layer".format(layer_name))
+
+ if layer_name == 'traffic_light':
+ return None
+
+ patch_x = patch_box[0]
+ patch_y = patch_box[1]
+
+ patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle)
+
+ line_list = []
+ records = getattr(self.map_explorer[location].map_api, layer_name)
+ for record in records:
+ line = self.map_explorer[location].map_api.extract_line(record['line_token'])
+ if line.is_empty: # Skip lines without nodes.
+ continue
+
+ new_line = line.intersection(patch)
+ if not new_line.is_empty:
+ new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False)
+ new_line = affinity.affine_transform(new_line,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ line_list.append(new_line)
+
+ return line_list
+
+ def get_ped_crossing_line(self, patch_box, patch_angle, location):
+ patch_x = patch_box[0]
+ patch_y = patch_box[1]
+
+ patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle)
+ polygon_list = []
+ records = getattr(self.map_explorer[location].map_api, 'ped_crossing')
+ # records = getattr(self.nusc_maps[location], 'ped_crossing')
+ for record in records:
+ polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token'])
+ if polygon.is_valid:
+ new_polygon = polygon.intersection(patch)
+ if not new_polygon.is_empty:
+ new_polygon = affinity.rotate(new_polygon, -patch_angle,
+ origin=(patch_x, patch_y), use_radians=False)
+ new_polygon = affinity.affine_transform(new_polygon,
+ [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+ if new_polygon.geom_type == 'Polygon':
+ new_polygon = MultiPolygon([new_polygon])
+ polygon_list.append(new_polygon)
+
+ return polygon_list
+
+ def sample_pts_from_line(self, line):
+ if self.fixed_num < 0:
+ distances = np.arange(0, line.length, self.sample_dist)
+ sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+ else:
+ # fixed number of points: sample fixed_num evenly spaced points (spacing = line.length / (fixed_num - 1))
+ distances = np.linspace(0, line.length, self.fixed_num)
+ sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+
+ # tmpdistances = np.linspace(0, line.length, 2)
+ # tmpsampled_points = np.array([list(line.interpolate(tmpdistance).coords) for tmpdistance in tmpdistances]).reshape(-1, 2)
+ # import pdb;pdb.set_trace()
+ # if self.normalize:
+ # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]])
+
+ num_valid = len(sampled_points)
+
+ if not self.padding or self.fixed_num > 0:
+ # fixed-num sampling can return now
+ return sampled_points, num_valid
+
+ # fixed-distance sampling needs padding
+ num_valid = len(sampled_points)
+
+ if self.fixed_num < 0:
+ if num_valid < self.num_samples:
+ padding = np.zeros((self.num_samples - len(sampled_points), 2))
+ sampled_points = np.concatenate([sampled_points, padding], axis=0)
+ else:
+ sampled_points = sampled_points[:self.num_samples, :]
+ num_valid = self.num_samples
+
+ # if self.normalize:
+ # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]])
+ # num_valid = len(sampled_points)
+
+ return sampled_points, num_valid
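+
+    # Hedged illustration (hypothetical helper, not in the original code): the two
+    # sampling modes implemented above, applied to a toy 10 m LineString.
+    @staticmethod
+    def _demo_sampling_modes(sample_dist=1.0, fixed_num=5):
+        line = LineString([(0.0, 0.0), (10.0, 0.0)])
+        by_dist = np.array([list(line.interpolate(d).coords)
+                            for d in np.arange(0, line.length, sample_dist)]).reshape(-1, 2)
+        by_num = np.array([list(line.interpolate(d).coords)
+                           for d in np.linspace(0, line.length, fixed_num)]).reshape(-1, 2)
+        return by_dist, by_num        # 10 points (one per metre) vs exactly fixed_num points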
+
+
+###############################################################################################################
+###############################################################################################################
+###############################################################################################################
+
+class v1CustomDetectionConfig:
+ """ Data class that specifies the detection evaluation settings. """
+
+ def __init__(self,
+ class_range_x: Dict[str, int],
+ class_range_y: Dict[str, int],
+ dist_fcn: str,
+ dist_ths: List[float],
+ dist_th_tp: float,
+ min_recall: float,
+ min_precision: float,
+ max_boxes_per_sample: int,
+ mean_ap_weight: int):
+
+ assert set(class_range_x.keys()) == set(DETECTION_NAMES), "Class count mismatch."
+ assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths."
+
+ self.class_range_x = class_range_x
+ self.class_range_y = class_range_y
+ self.dist_fcn = dist_fcn
+ self.dist_ths = dist_ths
+ self.dist_th_tp = dist_th_tp
+ self.min_recall = min_recall
+ self.min_precision = min_precision
+ self.max_boxes_per_sample = max_boxes_per_sample
+ self.mean_ap_weight = mean_ap_weight
+
+ self.class_names = self.class_range_y.keys()
+
+ def __eq__(self, other):
+ eq = True
+ for key in self.serialize().keys():
+ eq = eq and np.array_equal(getattr(self, key), getattr(other, key))
+ return eq
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'class_range_x': self.class_range_x,
+ 'class_range_y': self.class_range_y,
+ 'dist_fcn': self.dist_fcn,
+ 'dist_ths': self.dist_ths,
+ 'dist_th_tp': self.dist_th_tp,
+ 'min_recall': self.min_recall,
+ 'min_precision': self.min_precision,
+ 'max_boxes_per_sample': self.max_boxes_per_sample,
+ 'mean_ap_weight': self.mean_ap_weight
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized dictionary. """
+ return cls(content['class_range_x'],
+ content['class_range_y'],
+ content['dist_fcn'],
+ content['dist_ths'],
+ content['dist_th_tp'],
+ content['min_recall'],
+ content['min_precision'],
+ content['max_boxes_per_sample'],
+ content['mean_ap_weight'])
+
+ @property
+ def dist_fcn_callable(self):
+ """ Return the distance function corresponding to the dist_fcn string. """
+ if self.dist_fcn == 'center_distance':
+ return center_distance
+ else:
+ raise Exception('Error: Unknown distance function %s!' % self.dist_fcn)
+
+@DATASETS.register_module()
+class VADCustomNuScenesDataset(NuScenesDataset):
+ r"""Custom NuScenes Dataset.
+ """
+ MAPCLASSES = ('divider',)
+ def __init__(
+ self,
+ queue_length=4,
+ bev_size=(200, 200),
+ overlap_test=False,
+ with_attr=True,
+ fut_ts=6,
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+ map_classes=None,
+ map_ann_file=None,
+ map_fixed_ptsnum_per_line=-1,
+ map_eval_use_same_gt_sample_num_flag=False,
+ padding_value=-10000,
+ use_pkl_result=False,
+ custom_eval_version='vad_nusc_detection_cvpr_2019',
+ *args,
+ **kwargs
+ ):
+ super().__init__(*args, **kwargs)
+ self.queue_length = queue_length
+ self.overlap_test = overlap_test
+ self.bev_size = bev_size
+ self.with_attr = with_attr
+ self.fut_ts = fut_ts
+ self.use_pkl_result = use_pkl_result
+
+ self.custom_eval_version = custom_eval_version
+ # Check if config exists.
+ this_dir = os.path.dirname(os.path.abspath(__file__))
+ cfg_path = os.path.join(this_dir, '%s.json' % self.custom_eval_version)
+ assert os.path.exists(cfg_path), \
+ 'Requested unknown configuration {}'.format(self.custom_eval_version)
+ # Load config file and deserialize it.
+ with open(cfg_path, 'r') as f:
+ data = json.load(f)
+ self.custom_eval_detection_configs = v1CustomDetectionConfig.deserialize(data)
+
+ self.map_ann_file = map_ann_file
+ self.MAPCLASSES = self.get_map_classes(map_classes)
+ self.NUM_MAPCLASSES = len(self.MAPCLASSES)
+ self.pc_range = pc_range
+ patch_h = pc_range[4]-pc_range[1]
+ patch_w = pc_range[3]-pc_range[0]
+ self.patch_size = (patch_h, patch_w)
+ self.padding_value = padding_value
+ self.fixed_num = map_fixed_ptsnum_per_line
+ self.eval_use_same_gt_sample_num_flag = map_eval_use_same_gt_sample_num_flag
+ self.vector_map = VectorizedLocalMap(kwargs['data_root'],
+ patch_size=self.patch_size, map_classes=self.MAPCLASSES,
+ fixed_ptsnum_per_line=map_fixed_ptsnum_per_line,
+ padding_value=self.padding_value)
+ self.is_vis_on_test = True
+
+ @classmethod
+ def get_map_classes(cls, map_classes=None):
+ """Get class names of current dataset.
+
+ Args:
+ map_classes (Sequence[str] | str | None): If map_classes is None, use
+ the default MAPCLASSES defined by the dataset. If it is a
+ string, take it as a file name; the file contains one class
+ name per line. If it is a tuple or list, it overrides the
+ MAPCLASSES defined by the dataset.
+
+ Returns:
+ list[str]: A list of class names.
+ """
+ if map_classes is None:
+ return cls.MAPCLASSES
+
+ if isinstance(map_classes, str):
+ # take it as a file path
+ class_names = mmcv.list_from_file(map_classes)
+ elif isinstance(map_classes, (tuple, list)):
+ class_names = map_classes
+ else:
+ raise ValueError(f'Unsupported type {type(map_classes)} of map classes.')
+
+ return class_names
+
+ def vectormap_pipeline(self, example, input_dict):
+ '''
+ `example` is a dict whose keys ('img_metas', 'gt_bboxes_3d',
+ 'gt_labels_3d', 'img') all hold DataContainer values:
+ 'img_metas': cpu_only=True, wraps a dict;
+ 'gt_labels_3d': shape torch.Size([num_samples]), stack=False,
+ padding_value=0, cpu_only=False;
+ 'gt_bboxes_3d': stack=False, cpu_only=True.
+ '''
+ # import pdb;pdb.set_trace()
+ lidar2ego = np.eye(4)
+ lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix
+ lidar2ego[:3, 3] = input_dict['lidar2ego_translation']
+ ego2global = np.eye(4)
+ ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix
+ ego2global[:3, 3] = input_dict['ego2global_translation']
+
+ lidar2global = ego2global @ lidar2ego
+
+ lidar2global_translation = list(lidar2global[:3,3])
+ lidar2global_rotation = list(Quaternion(matrix=lidar2global).q)
+
+ location = input_dict['map_location']
+ ego2global_translation = input_dict['ego2global_translation']
+ ego2global_rotation = input_dict['ego2global_rotation']
+ anns_results = self.vector_map.gen_vectorized_samples(
+ location, lidar2global_translation, lidar2global_rotation
+ )
+
+ '''
+ anns_results (dict):
+ 'gt_vecs_pts_loc': list of num_vecs vectors, each with num_points*2 coordinates
+ 'gt_vecs_pts_num': list of num_vecs point counts
+ 'gt_vecs_label': list of num_vecs class indices
+ '''
+ gt_vecs_label = to_tensor(anns_results['gt_vecs_label'])
+ if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines):
+ gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc']
+ else:
+ gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc'])
+ try:
+ gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32)
+ except:
+ # empty tensor: leave it as-is (such samples are skipped in training,
+ # but we preserve it for test)
+ gt_vecs_pts_loc = gt_vecs_pts_loc
+
+ example['map_gt_labels_3d'] = DC(gt_vecs_label, cpu_only=False)
+ example['map_gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True)
+
+ return example
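+
+    # Hedged sketch (illustration only): the lidar-to-global transform above is the
+    # composition of two 4x4 homogeneous transforms built from quaternion rotations
+    # and translations. The helper and its toy inputs are hypothetical.
+    @staticmethod
+    def _demo_lidar2global(l2e_rot=(1, 0, 0, 0), l2e_trans=(0.9, 0.0, 1.8),
+                           e2g_rot=(1, 0, 0, 0), e2g_trans=(100.0, 50.0, 0.0)):
+        lidar2ego = np.eye(4)
+        lidar2ego[:3, :3] = Quaternion(l2e_rot).rotation_matrix
+        lidar2ego[:3, 3] = l2e_trans
+        ego2global = np.eye(4)
+        ego2global[:3, :3] = Quaternion(e2g_rot).rotation_matrix
+        ego2global[:3, 3] = e2g_trans
+        return ego2global @ lidar2ego    # maps homogeneous lidar-frame points to the global frame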
+
+ def prepare_train_data(self, index):
+ """
+ Training data preparation.
+ Args:
+ index (int): Index for accessing the target data.
+ Returns:
+ dict: Training data dict of the corresponding index.
+ """
+ data_queue = []
+
+ # temporal aug
+ prev_indexs_list = list(range(index-self.queue_length, index))
+ random.shuffle(prev_indexs_list)
+ prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True)
+ ##
+
+ input_dict = self.get_data_info(index)
+ if input_dict is None:
+ return None
+ frame_idx = input_dict['frame_idx']
+ scene_token = input_dict['scene_token']
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ example = self.vectormap_pipeline(example,input_dict)
+ if self.filter_empty_gt and \
+ ((example is None or ~(example['gt_labels_3d']._data != -1).any()) or \
+ (example is None or ~(example['map_gt_labels_3d']._data != -1).any())):
+ return None
+ data_queue.insert(0, example)
+ for i in prev_indexs_list:
+ i = max(0, i)
+ input_dict = self.get_data_info(i)
+ if input_dict is None:
+ return None
+ if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token:
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ example = self.vectormap_pipeline(example,input_dict)
+ if self.filter_empty_gt and \
+ (example is None or ~(example['gt_labels_3d']._data != -1).any()) and \
+ (example is None or ~(example['map_gt_labels_3d']._data != -1).any()):
+ return None
+ frame_idx = input_dict['frame_idx']
+ data_queue.insert(0, copy.deepcopy(example))
+ return self.union2one(data_queue)
+
+ def prepare_test_data(self, index):
+ """Prepare data for testing.
+
+ Args:
+ index (int): Index for accessing the target data.
+
+ Returns:
+ dict: Testing data dict of the corresponding index.
+ """
+ input_dict = self.get_data_info(index)
+ self.pre_pipeline(input_dict)
+ example = self.pipeline(input_dict)
+ if self.is_vis_on_test:
+ example = self.vectormap_pipeline(example, input_dict)
+ return example
+
+ def union2one(self, queue):
+ """
+ Convert the sample queue into one single sample.
+ """
+ imgs_list = [each['img'].data for each in queue]
+ metas_map = {}
+ prev_pos = None
+ prev_angle = None
+ for i, each in enumerate(queue):
+ metas_map[i] = each['img_metas'].data
+ if i == 0:
+ metas_map[i]['prev_bev'] = False
+ prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] = 0
+ metas_map[i]['can_bus'][-1] = 0
+ else:
+ metas_map[i]['prev_bev'] = True
+ tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+ tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+ metas_map[i]['can_bus'][:3] -= prev_pos
+ metas_map[i]['can_bus'][-1] -= prev_angle
+ prev_pos = copy.deepcopy(tmp_pos)
+ prev_angle = copy.deepcopy(tmp_angle)
+
+ queue[-1]['img'] = DC(torch.stack(imgs_list),
+ cpu_only=False, stack=True)
+ queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+ queue = queue[-1]
+ return queue
+
+ def get_ann_info(self, index):
+ """Get annotation info according to the given index.
+
+ Args:
+ index (int): Index of the annotation data to get.
+
+ Returns:
+ dict: Annotation information consists of the following keys:
+
+ - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
+ 3D ground truth bboxes
+ - gt_labels_3d (np.ndarray): Labels of ground truths.
+ - gt_names (list[str]): Class names of ground truths.
+ """
+ info = self.data_infos[index]
+ # filter out bbox containing no points
+ if self.use_valid_flag:
+ mask = info['valid_flag']
+ else:
+ mask = info['num_lidar_pts'] > 0
+ gt_bboxes_3d = info['gt_boxes'][mask]
+ gt_names_3d = info['gt_names'][mask]
+ gt_labels_3d = []
+ for cat in gt_names_3d:
+ if cat in self.CLASSES:
+ gt_labels_3d.append(self.CLASSES.index(cat))
+ else:
+ gt_labels_3d.append(-1)
+ gt_labels_3d = np.array(gt_labels_3d)
+
+ if self.with_velocity:
+ gt_velocity = info['gt_velocity'][mask]
+ nan_mask = np.isnan(gt_velocity[:, 0])
+ gt_velocity[nan_mask] = [0.0, 0.0]
+ gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
+
+ if self.with_attr:
+ gt_fut_trajs = info['gt_agent_fut_trajs'][mask]
+ gt_fut_masks = info['gt_agent_fut_masks'][mask]
+ gt_fut_goal = info['gt_agent_fut_goal'][mask]
+ gt_lcf_feat = info['gt_agent_lcf_feat'][mask]
+ gt_fut_yaw = info['gt_agent_fut_yaw'][mask]
+ attr_labels = np.concatenate(
+ [gt_fut_trajs, gt_fut_masks, gt_fut_goal[..., None], gt_lcf_feat, gt_fut_yaw], axis=-1
+ ).astype(np.float32)
+
+ # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
+ # the same as KITTI (0.5, 0.5, 0)
+ gt_bboxes_3d = LiDARInstance3DBoxes(
+ gt_bboxes_3d,
+ box_dim=gt_bboxes_3d.shape[-1],
+ origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
+
+ anns_results = dict(
+ gt_bboxes_3d=gt_bboxes_3d,
+ gt_labels_3d=gt_labels_3d,
+ gt_names=gt_names_3d,
+ attr_labels=attr_labels)
+
+ return anns_results
+
+ def get_data_info(self, index):
+ """Get data info according to the given index.
+
+ Args:
+ index (int): Index of the sample data to get.
+
+ Returns:
+ dict: Data information that will be passed to the data \
+ preprocessing pipelines. It includes the following keys:
+
+ - sample_idx (str): Sample index.
+ - pts_filename (str): Filename of point clouds.
+ - sweeps (list[dict]): Infos of sweeps.
+ - timestamp (float): Sample timestamp.
+ - img_filename (str, optional): Image filename.
+ - lidar2img (list[np.ndarray], optional): Transformations \
+ from lidar to different cameras.
+ - ann_info (dict): Annotation info.
+ """
+ info = self.data_infos[index]
+ # standard protocol modified from SECOND.Pytorch
+ input_dict = dict(
+ sample_idx=info['token'],
+ #pts_filename=info['lidar_path'],
+ #sweeps=info['sweeps'],
+ ego2global_translation=info['ego2global_translation'],
+ ego2global_rotation=info['ego2global_rotation'],
+ lidar2ego_translation=info['lidar2ego_translation'],
+ lidar2ego_rotation=info['lidar2ego_rotation'],
+ prev_idx=info['prev'],
+ next_idx=info['next'],
+ scene_token=info['scene_token'],
+ can_bus=info['can_bus'],
+ frame_idx=info['frame_idx'],
+ timestamp=info['timestamp'] / 1e6,
+ fut_valid_flag=info['fut_valid_flag'],
+ map_location=info['map_location'],
+ ego_his_trajs=info['gt_ego_his_trajs'],
+ ego_fut_trajs=info['gt_ego_fut_trajs'],
+ ego_fut_masks=info['gt_ego_fut_masks'],
+ ego_fut_cmd=info['gt_ego_fut_cmd'],
+ ego_lcf_feat=info['gt_ego_lcf_feat']
+ )
+ # lidar to ego transform
+ lidar2ego = np.eye(4).astype(np.float32)
+ lidar2ego[:3, :3] = Quaternion(info["lidar2ego_rotation"]).rotation_matrix
+ lidar2ego[:3, 3] = info["lidar2ego_translation"]
+ input_dict["lidar2ego"] = lidar2ego
+
+ if self.modality['use_camera']:
+ image_paths = []
+ lidar2img_rts = []
+ lidar2cam_rts = []
+ cam_intrinsics = []
+ input_dict["camera2ego"] = []
+ input_dict["camera_intrinsics"] = []
+ for cam_type, cam_info in info['cams'].items():
+ image_paths.append(cam_info['data_path'])
+ # obtain lidar to image transformation matrix
+ lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+ lidar2cam_t = cam_info[
+ 'sensor2lidar_translation'] @ lidar2cam_r.T
+ lidar2cam_rt = np.eye(4)
+ lidar2cam_rt[:3, :3] = lidar2cam_r.T
+ lidar2cam_rt[3, :3] = -lidar2cam_t
+ intrinsic = cam_info['cam_intrinsic']
+ viewpad = np.eye(4)
+ viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+ lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+ lidar2img_rts.append(lidar2img_rt)
+
+ cam_intrinsics.append(viewpad)
+ lidar2cam_rts.append(lidar2cam_rt.T)
+
+ # camera to ego transform
+ camera2ego = np.eye(4).astype(np.float32)
+ camera2ego[:3, :3] = Quaternion(
+ cam_info["sensor2ego_rotation"]
+ ).rotation_matrix
+ camera2ego[:3, 3] = cam_info["sensor2ego_translation"]
+ input_dict["camera2ego"].append(camera2ego)
+ # camera intrinsics
+ camera_intrinsics = np.eye(4).astype(np.float32)
+ camera_intrinsics[:3, :3] = cam_info["cam_intrinsic"]
+ input_dict["camera_intrinsics"].append(camera_intrinsics)
+
+ input_dict.update(
+ dict(
+ img_filename=image_paths,
+ lidar2img=lidar2img_rts,
+ cam_intrinsic=cam_intrinsics,
+ lidar2cam=lidar2cam_rts,
+ ))
+
+ # NOTE: now we load gt in test_mode for evaluating
+ # if not self.test_mode:
+ # annos = self.get_ann_info(index)
+ # input_dict['ann_info'] = annos
+
+ annos = self.get_ann_info(index)
+ input_dict['ann_info'] = annos
+
+ rotation = Quaternion(input_dict['ego2global_rotation'])
+ translation = input_dict['ego2global_translation']
+ can_bus = input_dict['can_bus']
+ can_bus[:3] = translation
+ can_bus[3:7] = rotation
+ patch_angle = quaternion_yaw(rotation) / np.pi * 180
+ if patch_angle < 0:
+ patch_angle += 360
+ can_bus[-2] = patch_angle / 180 * np.pi
+ can_bus[-1] = patch_angle
+
+ lidar2ego = np.eye(4)
+ lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix
+ lidar2ego[:3, 3] = input_dict['lidar2ego_translation']
+ ego2global = np.eye(4)
+ ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix
+ ego2global[:3, 3] = input_dict['ego2global_translation']
+ lidar2global = ego2global @ lidar2ego
+ input_dict['lidar2global'] = lidar2global
+
+ return input_dict
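+
+    # Hedged sketch (not from the original code): the per-camera lidar2img matrix built
+    # above is the 4x4-padded intrinsic matrix composed with the lidar-to-camera
+    # extrinsic. Toy intrinsics, identity extrinsic, hypothetical helper name.
+    @staticmethod
+    def _demo_lidar2img(focal=1000.0, cx=800.0, cy=450.0):
+        intrinsic = np.array([[focal, 0.0, cx],
+                              [0.0, focal, cy],
+                              [0.0, 0.0, 1.0]])
+        viewpad = np.eye(4)
+        viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic   # pad intrinsics to 4x4
+        lidar2cam = np.eye(4)                                           # identity extrinsic for the toy case
+        return viewpad @ lidar2cam                                      # projects homogeneous lidar points to pixels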
+
+ def __getitem__(self, idx):
+ """Get item from infos according to the given index.
+ Returns:
+ dict: Data dictionary of the corresponding index.
+ """
+ if self.test_mode:
+ return self.prepare_test_data(idx)
+ while True:
+
+ data = self.prepare_train_data(idx)
+ if data is None:
+ idx = self._rand_another(idx)
+ continue
+ return data
+
+ def _format_gt(self):
+ gt_annos = []
+ print('Start to convert gt map format...')
+ # assert self.map_ann_file is not None
+ if (not os.path.exists(self.map_ann_file)) :
+ dataset_length = len(self)
+ prog_bar = mmcv.ProgressBar(dataset_length)
+ mapped_class_names = self.MAPCLASSES
+ for sample_id in range(dataset_length):
+ sample_token = self.data_infos[sample_id]['token']
+ gt_anno = {}
+ gt_anno['sample_token'] = sample_token
+ # gt_sample_annos = []
+ gt_sample_dict = {}
+ gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id])
+ gt_labels = gt_sample_dict['map_gt_labels_3d'].data.numpy()
+ gt_vecs = gt_sample_dict['map_gt_bboxes_3d'].data.instance_list
+ gt_vec_list = []
+ for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)):
+ name = mapped_class_names[gt_label]
+ anno = dict(
+ pts=np.array(list(gt_vec.coords)),
+ pts_num=len(list(gt_vec.coords)),
+ cls_name=name,
+ type=gt_label,
+ )
+ gt_vec_list.append(anno)
+ gt_anno['vectors']=gt_vec_list
+ gt_annos.append(gt_anno)
+
+ prog_bar.update()
+ nusc_submissions = {
+ 'GTs': gt_annos
+ }
+ print('\n GT anns written to', self.map_ann_file)
+ dump(nusc_submissions, self.map_ann_file)
+ else:
+ print(f'{self.map_ann_file} exists, skip updating')
+
+ def _format_bbox(self, results, jsonfile_prefix=None, score_thresh=0.2):
+ """Convert the results to the standard format.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str): The prefix of the output jsonfile.
+ You can specify the output directory/filename by
+ modifying the jsonfile_prefix. Default: None.
+
+ Returns:
+ str: Path of the output json file.
+ """
+ nusc_annos = {}
+ det_mapped_class_names = self.CLASSES
+
+ # assert self.map_ann_file is not None
+ map_pred_annos = {}
+ map_mapped_class_names = self.MAPCLASSES
+
+ plan_annos = {}
+
+ print('Start to convert detection format...')
+ for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
+ annos = []
+ boxes = output_to_nusc_box(det)
+ sample_token = self.data_infos[sample_id]['token']
+
+ plan_annos[sample_token] = [det['ego_fut_preds'], det['ego_fut_cmd']]
+
+ boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes,
+ det_mapped_class_names,
+ self.custom_eval_detection_configs,
+ self.eval_version)
+ for i, box in enumerate(boxes):
+ if box.score < score_thresh:
+ continue
+ name = det_mapped_class_names[box.label]
+ if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2:
+ if name in [
+ 'car',
+ 'construction_vehicle',
+ 'bus',
+ 'truck',
+ 'trailer',
+ ]:
+ attr = 'vehicle.moving'
+ elif name in ['bicycle', 'motorcycle']:
+ attr = 'cycle.with_rider'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+ else:
+ if name in ['pedestrian']:
+ attr = 'pedestrian.standing'
+ elif name in ['bus']:
+ attr = 'vehicle.stopped'
+ else:
+ attr = NuScenesDataset.DefaultAttribute[name]
+
+ nusc_anno = dict(
+ sample_token=sample_token,
+ translation=box.center.tolist(),
+ size=box.wlh.tolist(),
+ rotation=box.orientation.elements.tolist(),
+ velocity=box.velocity[:2].tolist(),
+ detection_name=name,
+ detection_score=box.score,
+ attribute_name=attr,
+ fut_traj=box.fut_trajs.tolist())
+ annos.append(nusc_anno)
+ nusc_annos[sample_token] = annos
+
+
+ map_pred_anno = {}
+ vecs = output_to_vecs(det)
+ sample_token = self.data_infos[sample_id]['token']
+ map_pred_anno['sample_token'] = sample_token
+ pred_vec_list=[]
+ for i, vec in enumerate(vecs):
+ name = map_mapped_class_names[vec['label']]
+ anno = dict(
+ # sample_token=sample_token,
+ pts=vec['pts'],
+ pts_num=len(vec['pts']),
+ cls_name=name,
+ type=vec['label'],
+ confidence_level=vec['score'])
+ pred_vec_list.append(anno)
+ # annos.append(nusc_anno)
+ # nusc_annos[sample_token] = annos
+ map_pred_anno['vectors'] = pred_vec_list
+ map_pred_annos[sample_token] = map_pred_anno
+
+ if not os.path.exists(self.map_ann_file):
+ self._format_gt()
+ else:
+ print(f'{self.map_ann_file} exists, skip updating')
+ # with open(self.map_ann_file,'r') as f:
+ # GT_anns = json.load(f)
+ # gt_annos = GT_anns['GTs']
+
+ nusc_submissions = {
+ 'meta': self.modality,
+ 'results': nusc_annos,
+ 'map_results': map_pred_annos,
+ 'plan_results': plan_annos
+ # 'GTs': gt_annos
+ }
+
+ mmcv.mkdir_or_exist(jsonfile_prefix)
+ if self.use_pkl_result:
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.pkl')
+ else:
+ res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+ print('Results written to', res_path)
+ dump(nusc_submissions, res_path)
+ return res_path
+
+ def format_results(self, results, jsonfile_prefix=None):
+ """Format the results to json (standard format for COCO evaluation).
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+
+ Returns:
+ tuple: Returns (result_files, tmp_dir), where `result_files` is a \
+ dict containing the json filepaths, `tmp_dir` is the temporary \
+ directory created for saving json files when \
+ `jsonfile_prefix` is not specified.
+ """
+ if isinstance(results, dict):
+ # print(f'results must be a list, but get dict, keys={results.keys()}')
+ # assert isinstance(results, list)
+ results = results['bbox_results']
+ assert isinstance(results, list)
+ # assert len(results) == len(self), (
+ # 'The length of results is not equal to the dataset len: {} != {}'.
+ # format(len(results), len(self)))
+
+ if jsonfile_prefix is None:
+ tmp_dir = tempfile.TemporaryDirectory()
+ jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+ else:
+ tmp_dir = None
+
+ # currently the output prediction results could be in two formats
+ # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
+ # 2. list of dict('pts_bbox' or 'img_bbox':
+ # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
+ # this is a workaround to enable evaluation of both formats on nuScenes
+ # refer to https://github.com/open-mmlab/mmdetection3d/issues/449
+ if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+ result_files = self._format_bbox(results, jsonfile_prefix)
+ else:
+ # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+ result_files = dict()
+ for name in results[0]:
+ if name == 'metric_results':
+ continue
+                print(f'\nFormatting bboxes of {name}')
+ results_ = [out[name] for out in results]
+ tmp_file_ = osp.join(jsonfile_prefix, name)
+ result_files.update(
+ {name: self._format_bbox(results_, tmp_file_)})
+ return result_files, tmp_dir
+
+ def _evaluate_single(self,
+ result_path,
+ logger=None,
+ metric='bbox',
+ map_metric='chamfer',
+ result_name='pts_bbox'):
+ """Evaluation for a single model in nuScenes protocol.
+
+ Args:
+ result_path (str): Path of the result file.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+            metric (str): Metric name used for evaluation. Default: 'bbox'.
+            map_metric (str): Metric name used for the vectorized map
+                evaluation. Default: 'chamfer'.
+            result_name (str): Result name in the metric prefix.
+                Default: 'pts_bbox'.
+
+ Returns:
+ dict: Dictionary of evaluation details.
+ """
+ detail = dict()
+ from nuscenes import NuScenes
+ self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
+ verbose=False)
+
+ output_dir = osp.join(*osp.split(result_path)[:-1])
+
+ eval_set_map = {
+ 'v1.0-mini': 'mini_val',
+ 'v1.0-trainval': 'val',
+ }
+ self.nusc_eval = NuScenesEval_custom(
+ self.nusc,
+ config=self.custom_eval_detection_configs,
+ result_path=result_path,
+ eval_set=eval_set_map[self.version],
+ output_dir=output_dir,
+ verbose=False,
+ overlap_test=self.overlap_test,
+ data_infos=self.data_infos
+ )
+ self.nusc_eval.main(plot_examples=0, render_curves=False)
+ # record metrics
+ metrics = load(osp.join(output_dir, 'metrics_summary.json'))
+ metric_prefix = f'{result_name}_NuScenes'
+ for name in self.CLASSES:
+ for k, v in metrics['label_aps'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['label_tp_errors'][name].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+ for k, v in metrics['tp_errors'].items():
+ val = float('{:.4f}'.format(v))
+ detail['{}/{}'.format(metric_prefix,
+ self.ErrNameMapping[k])] = val
+ detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+ detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+
+
+ from mmcv.datasets.map_utils.mean_ap import eval_map
+ from mmcv.datasets.map_utils.mean_ap import format_res_gt_by_classes
+ result_path = osp.abspath(result_path)
+
+        print('Formatting results & gts by classes')
+ pred_results = load(result_path)
+ map_results = pred_results['map_results']
+ gt_anns = load(self.map_ann_file)
+ map_annotations = gt_anns['GTs']
+ cls_gens, cls_gts = format_res_gt_by_classes(result_path,
+ map_results,
+ map_annotations,
+ cls_names=self.MAPCLASSES,
+ num_pred_pts_per_instance=self.fixed_num,
+ eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag,
+ pc_range=self.pc_range)
+ map_metrics = map_metric if isinstance(map_metric, list) else [map_metric]
+ allowed_metrics = ['chamfer', 'iou']
+ for metric in map_metrics:
+ if metric not in allowed_metrics:
+ raise KeyError(f'metric {metric} is not supported')
+ for metric in map_metrics:
+ print('-*'*10+f'use metric:{metric}'+'-*'*10)
+ if metric == 'chamfer':
+ thresholds = [0.5,1.0,1.5]
+ elif metric == 'iou':
+ thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+ cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES))
+ for i, thr in enumerate(thresholds):
+                print('-*'*10+f'threshold:{thr}'+'-*'*10)
+ mAP, cls_ap = eval_map(
+ map_results,
+ map_annotations,
+ cls_gens,
+ cls_gts,
+ threshold=thr,
+ cls_names=self.MAPCLASSES,
+ logger=logger,
+ num_pred_pts_per_instance=self.fixed_num,
+ pc_range=self.pc_range,
+ metric=metric)
+ for j in range(self.NUM_MAPCLASSES):
+ cls_aps[i, j] = cls_ap[j]['ap']
+ for i, name in enumerate(self.MAPCLASSES):
+ print('{}: {}'.format(name, cls_aps.mean(0)[i]))
+ detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i]
+ print('map: {}'.format(cls_aps.mean(0).mean()))
+ detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean()
+ for i, name in enumerate(self.MAPCLASSES):
+ for j, thr in enumerate(thresholds):
+ if metric == 'chamfer':
+ detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+ elif metric == 'iou':
+ if thr == 0.5 or thr == 0.75:
+ detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+
+ return detail
+
+ def evaluate(self,
+ results,
+ metric='bbox',
+ map_metric='chamfer',
+ logger=None,
+ jsonfile_prefix=None,
+ result_names=['pts_bbox'],
+ show=False,
+ out_dir=None,
+ pipeline=None):
+ """Evaluation in nuScenes protocol.
+
+ Args:
+ results (list[dict]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated.
+            map_metric (str | list[str]): Metrics used for the vectorized map
+                evaluation. Default: 'chamfer'.
+ logger (logging.Logger | str | None): Logger used for printing
+ related information during evaluation. Default: None.
+ jsonfile_prefix (str | None): The prefix of json files. It includes
+ the file path and the prefix of filename, e.g., "a/b/prefix".
+ If not specified, a temp file will be created. Default: None.
+ show (bool): Whether to visualize.
+ Default: False.
+ out_dir (str): Path to save the visualization results.
+ Default: None.
+ pipeline (list[dict], optional): raw data loading for showing.
+ Default: None.
+
+ Returns:
+ dict[str, float]: Results of each evaluation metric.
+ """
+ result_metric_names = ['EPA', 'ADE', 'FDE', 'MR']
+ motion_cls_names = ['car', 'pedestrian']
+ motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit',
+ 'fp', 'ADE', 'FDE', 'MR']
+ all_metric_dict = {}
+ for met in motion_metric_names:
+ for cls in motion_cls_names:
+ all_metric_dict[met+'_'+cls] = 0.0
+ result_dict = {}
+ for met in result_metric_names:
+ for cls in motion_cls_names:
+ result_dict[met+'_'+cls] = 0.0
+
+ alpha = 0.5
+
+ for i in range(len(results)):
+ for key in all_metric_dict.keys():
+ all_metric_dict[key] += results[i]['metric_results'][key]
+
+ for cls in motion_cls_names:
+ result_dict['EPA_'+cls] = (all_metric_dict['hit_'+cls] - \
+ alpha * all_metric_dict['fp_'+cls]) / all_metric_dict['gt_'+cls]
+ result_dict['ADE_'+cls] = all_metric_dict['ADE_'+cls] / all_metric_dict['cnt_ade_'+cls]
+ result_dict['FDE_'+cls] = all_metric_dict['FDE_'+cls] / all_metric_dict['cnt_fde_'+cls]
+ result_dict['MR_'+cls] = all_metric_dict['MR_'+cls] / all_metric_dict['cnt_fde_'+cls]
+
+ print('\n')
+ print('-------------- Motion Prediction --------------')
+ for k, v in result_dict.items():
+ print(f'{k}: {v}')
+
+ # NOTE: print planning metric
+ print('\n')
+ print('-------------- Planning --------------')
+ metric_dict = None
+ num_valid = 0
+ for res in results:
+ if res['metric_results']['fut_valid_flag']:
+ num_valid += 1
+ else:
+ continue
+ if metric_dict is None:
+ metric_dict = copy.deepcopy(res['metric_results'])
+ else:
+ for k in res['metric_results'].keys():
+ metric_dict[k] += res['metric_results'][k]
+
+ for k in metric_dict:
+ metric_dict[k] = metric_dict[k] / num_valid
+ print("{}:{}".format(k, metric_dict[k]))
+
+ result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+ if isinstance(result_files, dict):
+ results_dict = dict()
+ for name in result_names:
+ print('Evaluating bboxes of {}'.format(name))
+ ret_dict = self._evaluate_single(result_files[name], metric=metric, map_metric=map_metric)
+ results_dict.update(ret_dict)
+ elif isinstance(result_files, str):
+ results_dict = self._evaluate_single(result_files, metric=metric, map_metric=map_metric)
+
+ if tmp_dir is not None:
+ tmp_dir.cleanup()
+
+ if show:
+ self.show(results, out_dir, pipeline=pipeline)
+ return results_dict
+
+def output_to_nusc_box(detection):
+ """Convert the output to the box class in the nuScenes.
+
+ Args:
+ detection (dict): Detection results.
+
+ - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+ - scores_3d (torch.Tensor): Detection scores.
+            - labels_3d (torch.Tensor): Predicted box labels.
+            - trajs_3d (torch.Tensor): Predicted future trajectories.
+
+ Returns:
+ list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+ """
+ box3d = detection['boxes_3d']
+ scores = detection['scores_3d'].numpy()
+ labels = detection['labels_3d'].numpy()
+ trajs = detection['trajs_3d'].numpy()
+
+
+ box_gravity_center = box3d.gravity_center.numpy()
+ box_dims = box3d.dims.numpy()
+ box_yaw = box3d.yaw.numpy()
+ # TODO: check whether this is necessary
+ # with dir_offset & dir_limit in the head
+ box_yaw = -box_yaw - np.pi / 2
+
+ box_list = []
+ for i in range(len(box3d)):
+ quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+ velocity = (*box3d.tensor[i, 7:9], 0.0)
+ # velo_val = np.linalg.norm(box3d[i, 7:9])
+ # velo_ori = box3d[i, 6]
+ # velocity = (
+ # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
+ box = CustomNuscenesBox(
+ center=box_gravity_center[i],
+ size=box_dims[i],
+ orientation=quat,
+ fut_trajs=trajs[i],
+ label=labels[i],
+ score=scores[i],
+ velocity=velocity)
+ box_list.append(box)
+ return box_list
+
+
+def lidar_nusc_box_to_global(info,
+ boxes,
+ classes,
+ eval_configs,
+ eval_version='detection_cvpr_2019'):
+ """Convert the box from ego to global coordinate.
+
+ Args:
+ info (dict): Info for a specific sample data, including the
+ calibration information.
+ boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+ classes (list[str]): Mapped classes in the evaluation.
+ eval_configs (object): Evaluation configuration object.
+ eval_version (str): Evaluation version.
+ Default: 'detection_cvpr_2019'
+
+ Returns:
+ list: List of standard NuScenesBoxes in the global
+ coordinate.
+ """
+ box_list = []
+ for box in boxes:
+ # Move box to ego vehicle coord system
+ box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation']))
+ box.translate(np.array(info['lidar2ego_translation']))
+ # filter det in ego.
+ cls_range_x_map = eval_configs.class_range_x
+ cls_range_y_map = eval_configs.class_range_y
+ x_distance, y_distance = box.center[0], box.center[1]
+ det_range_x = cls_range_x_map[classes[box.label]]
+ det_range_y = cls_range_y_map[classes[box.label]]
+ if abs(x_distance) > det_range_x or abs(y_distance) > det_range_y:
+ continue
+ # Move box to global coord system
+ box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
+ box.translate(np.array(info['ego2global_translation']))
+ box_list.append(box)
+ return box_list
+
+def output_to_vecs(detection):
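+    """Convert map head outputs into a list of per-instance dicts.
+
+    Each dict holds the predicted 2D bbox (xyxy), class label, confidence
+    score and the sampled polyline points of one vectorized map element.
+    """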
+ box3d = detection['map_boxes_3d'].numpy()
+ scores = detection['map_scores_3d'].numpy()
+ labels = detection['map_labels_3d'].numpy()
+ pts = detection['map_pts_3d'].numpy()
+
+ vec_list = []
+ # import pdb;pdb.set_trace()
+ for i in range(box3d.shape[0]):
+ vec = dict(
+ bbox = box3d[i], # xyxy
+ label=labels[i],
+ score=scores[i],
+ pts=pts[i],
+ )
+ vec_list.append(vec)
+ return vec_list
\ No newline at end of file
diff --git a/mmcv/datasets/nuscnes_eval.py b/mmcv/datasets/nuscnes_eval.py
new file mode 100644
index 0000000..2b14535
--- /dev/null
+++ b/mmcv/datasets/nuscnes_eval.py
@@ -0,0 +1,756 @@
+import argparse
+import copy
+import json
+import os
+import random
+import time
+from typing import Tuple, Dict, Any
+
+import cv2
+import numpy as np
+import pycocotools.mask as mask_util
+import torch
+import tqdm
+from IPython import embed
+from matplotlib import pyplot as plt
+from pyquaternion import Quaternion
+from torchvision.transforms.functional import rotate
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.common.loaders import (load_prediction, load_gt,
+                                          add_center_dist, filter_eval_boxes)
+from nuscenes.eval.common.render import setup_axis
+from nuscenes.eval.common.utils import quaternion_yaw, boxes_to_sensor
+from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp
+from nuscenes.eval.detection.constants import (TP_METRICS, DETECTION_NAMES,
+                                               DETECTION_COLORS, TP_METRICS_UNITS,
+                                               PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS)
+from nuscenes.eval.detection.data_classes import (DetectionConfig, DetectionMetrics,
+                                                  DetectionBox, DetectionMetricData,
+                                                  DetectionMetricDataList)
+from nuscenes.eval.detection.evaluate import NuScenesEval
+from nuscenes.eval.detection.render import (summary_plot, class_pr_curve,
+                                            dist_pr_curve, visualize_sample)
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.tracking.data_classes import TrackingBox
+from nuscenes.utils.data_classes import Box, LidarPointCloud
+from nuscenes.utils.geometry_utils import (points_in_box, view_points, box_in_image,
+                                           BoxVisibility, transform_matrix)
+from nuscenes.utils.splits import create_splits_scenes
+
+from mmcv.core.bbox import BboxOverlaps3D
+# from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+
+
+
+Axis = Any
+
+def class_tp_curve(md_list: DetectionMetricDataList,
+ metrics: DetectionMetrics,
+ detection_name: str,
+ min_recall: float,
+ dist_th_tp: float,
+ savepath: str = None,
+ ax: Axis = None) -> None:
+ """
+ Plot the true positive curve for the specified class.
+ :param md_list: DetectionMetricDataList instance.
+ :param metrics: DetectionMetrics instance.
+ :param detection_name:
+ :param min_recall: Minimum recall value.
+ :param dist_th_tp: The distance threshold used to determine matches.
+ :param savepath: If given, saves the the rendering here instead of displaying.
+ :param ax: Axes onto which to render.
+ """
+ # Get metric data for given detection class with tp distance threshold.
+
+ md = md_list[(detection_name, dist_th_tp)]
+ min_recall_ind = round(100 * min_recall)
+ if min_recall_ind <= md.max_recall_ind:
+ # For traffic_cone and barrier only a subset of the metrics are plotted.
+ rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
+ ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
+ else:
+ ylimit = 1.0
+
+ # Prepare axis.
+ if ax is None:
+ ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
+ min_recall=min_recall)
+ ax.set_ylim(0, ylimit)
+
+ # Plot the recall vs. error curve for each tp metric.
+ for metric in TP_METRICS:
+ tp = metrics.get_label_tp(detection_name, metric)
+
+ # Plot only if we have valid data.
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
+ recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
+ else:
+ recall, error = [], []
+
+ # Change legend based on tp value
+ if tp is np.nan:
+ label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
+ elif min_recall_ind > md.max_recall_ind:
+ label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
+ else:
+ label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
+ if metric == 'trans_err':
+ label += f' ({md.max_recall_ind})' # add recall
+ print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
+ ax.plot(recall, error, label=label)
+ ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
+ ax.legend(loc='best')
+
+ if savepath is not None:
+ plt.savefig(savepath)
+ plt.close()
+
+
+class DetectionBox_modified(DetectionBox):
+ def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
+        '''
+        DetectionBox extended with the annotation token, visibility token and
+        frame index within the scene.
+        '''
+ super().__init__(*args, **kwargs)
+ self.token = token
+ self.visibility = visibility
+ self.index = index
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'token': self.token,
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'visibility': self.visibility,
+ 'index': self.index
+
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(
+ token=content['token'],
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'],
+ visibility=content['visibility'],
+ index=content['index'],
+ )
+
+
+def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if a box's center is visible inside an image, without accounting for occlusions.
+    :param box: The box to be checked.
+    :param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of <BoxVisibility>.
+    :return True if visibility condition is satisfied.
+ """
+
+ center_3d = box.center.reshape(3, 1)
+ center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, center_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, center_img[1, :] > 0)
+ visible = np.logical_and(visible, center_3d[2, :] > 1)
+
+ in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
+
+ if vis_level == BoxVisibility.ALL:
+ return all(visible) and all(in_front)
+ elif vis_level == BoxVisibility.ANY:
+ return any(visible) and all(in_front)
+ elif vis_level == BoxVisibility.NONE:
+ return True
+ else:
+ raise ValueError("vis_level: {} not valid".format(vis_level))
+
+
+def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
+ vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if some, but not all, of a box's corners are visible in the image.
+    :param box: The box to be checked.
+    :param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of <BoxVisibility>.
+    :return True if visibility condition is satisfied.
+ """
+
+ corners_3d = box.corners()
+ corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, corners_img[1, :] > 0)
+ visible = np.logical_and(visible, corners_3d[2, :] > 1)
+
+ in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
+
+ if any(visible) and not all(visible) and all(in_front):
+ return True
+ else:
+ return False
+
+
+def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False):
+ """
+ Loads ground truth boxes from DB.
+ :param nusc: A NuScenes instance.
+ :param eval_split: The evaluation split for which we load GT boxes.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+ :param verbose: Whether to print messages to stdout.
+ :return: The GT boxes.
+ """
+
+ # Init.
+ if box_cls == DetectionBox_modified:
+ attribute_map = {a['token']: a['name'] for a in nusc.attribute}
+
+ if verbose:
+ print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
+ # Read out all sample_tokens in DB.
+ sample_tokens_all = [s['token'] for s in nusc.sample]
+ assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
+
+ # Only keep samples from this split.
+ splits = create_splits_scenes()
+
+ # Check compatibility of split with nusc_version.
+ version = nusc.version
+ if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
+ assert version.endswith('trainval'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split in {'mini_train', 'mini_val'}:
+ assert version.endswith('mini'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split == 'test':
+ assert version.endswith('test'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ else:
+ raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
+ .format(eval_split))
+
+ if eval_split == 'test':
+ # Check that you aren't trying to cheat :).
+ assert len(nusc.sample_annotation) > 0, \
+ 'Error: You are trying to evaluate on the test set but you do not have the annotations!'
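+    # Build a map from sample_token to its 1-based frame index within its scene.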
+ index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ index_map[sample['token']] = index
+ index += 1
+
+ sample_tokens = []
+ for sample_token in sample_tokens_all:
+ scene_token = nusc.get('sample', sample_token)['scene_token']
+ scene_record = nusc.get('scene', scene_token)
+ if scene_record['name'] in splits[eval_split]:
+ sample_tokens.append(sample_token)
+
+ all_annotations = EvalBoxes()
+
+ # Load annotations and filter predictions and annotations.
+ tracking_id_set = set()
+ for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
+
+ sample = nusc.get('sample', sample_token)
+ sample_annotation_tokens = sample['anns']
+
+ sample_boxes = []
+ for sample_annotation_token in sample_annotation_tokens:
+
+ sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
+ if box_cls == DetectionBox_modified:
+ # Get label name in detection task and filter unused labels.
+ detection_name = category_to_detection_name(sample_annotation['category_name'])
+ if detection_name is None:
+ continue
+
+ # Get attribute_name.
+ attr_tokens = sample_annotation['attribute_tokens']
+ attr_count = len(attr_tokens)
+ if attr_count == 0:
+ attribute_name = ''
+ elif attr_count == 1:
+ attribute_name = attribute_map[attr_tokens[0]]
+ else:
+ raise Exception('Error: GT annotations must not have more than one attribute!')
+
+ sample_boxes.append(
+ box_cls(
+ token=sample_annotation_token,
+ sample_token=sample_token,
+ translation=sample_annotation['translation'],
+ size=sample_annotation['size'],
+ rotation=sample_annotation['rotation'],
+ velocity=nusc.box_velocity(sample_annotation['token'])[:2],
+ num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
+ detection_name=detection_name,
+ detection_score=-1.0, # GT samples do not have a score.
+ attribute_name=attribute_name,
+ visibility=sample_annotation['visibility_token'],
+ index=index_map[sample_token]
+ )
+ )
+ elif box_cls == TrackingBox:
+ assert False
+ else:
+ raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
+
+ all_annotations.add_boxes(sample_token, sample_boxes)
+
+ if verbose:
+ print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
+
+ return all_annotations
+
+
+def filter_eval_boxes_by_id(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ id=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filter boxes, keeping only those whose annotation token is in ``id``.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param id: The set of annotation tokens used to keep boxes.
+    :param verbose: Whether to print to stdout.
+    :return: The filtered EvalBoxes.
+ """
+
+ # Accumulators for number of filtered boxes.
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.token in id:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_eval_boxes_by_visibility(
+ ori_eval_boxes: EvalBoxes,
+ visibility=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filter boxes, keeping only those with the given visibility token.
+    :param ori_eval_boxes: An instance of the EvalBoxes class.
+    :param visibility: The visibility token used to keep boxes.
+    :param verbose: Whether to print to stdout.
+    :return: A filtered deep copy of the boxes.
+ """
+
+ # Accumulators for number of filtered boxes.
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.visibility == visibility:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After visibility based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False):
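+    """Return a copy of the boxes that keeps only the given sample tokens."""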
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ for sample_token in eval_boxes.sample_tokens:
+ if sample_token not in valid_sample_tokens:
+ eval_boxes.boxes.pop(sample_token)
+ return eval_boxes
+
+
+def filter_eval_boxes_by_overlap(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Filter boxes based on camera overlap: keep only boxes whose center projects
+    into more than one camera image.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param verbose: Whether to print to stdout.
+    :return: The filtered EvalBoxes.
+ """
+
+ # Accumulators for number of filtered boxes.
+ cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ sample_record = nusc.get('sample', sample_token)
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ count = 0
+ for cam in cams:
+                # Copy-pasted from the nuScenes devkit.
+ sample_data_token = sample_record['data'][cam]
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ new_box = Box(box.translation, box.size, Quaternion(box.rotation),
+ name=box.detection_name, token='')
+
+ # Move box to ego vehicle coord system.
+ new_box.translate(-np.array(pose_record['translation']))
+ new_box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ new_box.translate(-np.array(cs_record['translation']))
+ new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ count += 1
+ # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ # count += 1
+
+ if count > 1:
+ with open('center_overlap.txt', 'a') as f:
+ try:
+ f.write(box.token + '\n')
+ except:
+ pass
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ verbose = True
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+class NuScenesEval_custom(NuScenesEval):
+ """
+    Customized nuScenes detection evaluation with overlap- and visibility-based GT filtering.
+ """
+
+ def __init__(self,
+ nusc: NuScenes,
+ config: DetectionConfig,
+ result_path: str,
+ eval_set: str,
+ output_dir: str = None,
+ verbose: bool = True,
+ overlap_test=False,
+ eval_mask=False,
+ data_infos=None
+ ):
+ """
+ Initialize a DetectionEval object.
+ :param nusc: A NuScenes object.
+ :param config: A DetectionConfig object.
+ :param result_path: Path of the nuScenes JSON result file.
+ :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+ :param output_dir: Folder to save plots and results to.
+ :param verbose: Whether to print to stdout.
+ """
+
+ self.nusc = nusc
+ self.result_path = result_path
+ self.eval_set = eval_set
+ self.output_dir = output_dir
+ self.verbose = verbose
+ self.cfg = config
+ self.overlap_test = overlap_test
+ self.eval_mask = eval_mask
+ self.data_infos = data_infos
+ # Check result file exists.
+ assert os.path.exists(result_path), 'Error: The result file does not exist!'
+
+ # Make dirs.
+ self.plot_dir = os.path.join(self.output_dir, 'plots')
+ if not os.path.isdir(self.output_dir):
+ os.makedirs(self.output_dir)
+ if not os.path.isdir(self.plot_dir):
+ os.makedirs(self.plot_dir)
+
+ # Load data.
+ if verbose:
+ print('Initializing nuScenes detection evaluation')
+ self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
+ verbose=verbose)
+ self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
+
+ assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+ "Samples in split doesn't match samples in predictions."
+
+ # Add center distances.
+ # self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+ # self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+
+ # Filter boxes (distance, points per box, etc.).
+
+ if verbose:
+ print('Filtering predictions')
+ self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
+ if verbose:
+ print('Filtering ground truth annotations')
+ self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)
+
+ if self.overlap_test:
+ self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
+
+ self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)
+
+ self.all_gt = copy.deepcopy(self.gt_boxes)
+ self.all_preds = copy.deepcopy(self.pred_boxes)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
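+        # Map each sample_token to its 1-based frame index within its scene (used by update_gt).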
+ self.index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ self.index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ self.index_map[sample['token']] = index
+ index += 1
+
+ def update_gt(self, type_='vis', visibility='1', index=1):
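+        """Re-filter the cached boxes before re-running evaluation.
+
+        type_='vis': keep only GT boxes with the given visibility token.
+        type_='ord': keep only samples at the given frame index within each
+        scene (applied to both GT and predictions).
+        """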
+ if type_ == 'vis':
+ self.visibility_test = True
+ if self.visibility_test:
+ '''[{'description': 'visibility of whole object is between 0 and 40%',
+ 'token': '1',
+ 'level': 'v0-40'},
+ {'description': 'visibility of whole object is between 40 and 60%',
+ 'token': '2',
+ 'level': 'v40-60'},
+ {'description': 'visibility of whole object is between 60 and 80%',
+ 'token': '3',
+ 'level': 'v60-80'},
+ {'description': 'visibility of whole object is between 80 and 100%',
+ 'token': '4',
+ 'level': 'v80-100'}]'''
+
+ self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True)
+
+ elif type_ == 'ord':
+
+ valid_tokens = [key for (key, value) in self.index_map.items() if value == index]
+ # from IPython import embed
+ # embed()
+ self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
+ self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+
+ def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMetricDataList()
+
+ # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths)
+ # self.cfg.dist_ths = [0.3]
+ # self.cfg.dist_fcn_callable
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMetrics(self.cfg)
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None:
+ """
+ Renders various PR and TP curves.
+ :param metrics: DetectionMetrics instance.
+ :param md_list: DetectionMetricDataList instance.
+ """
+ if self.verbose:
+ print('Rendering PR and TP curves')
+
+ def savepath(name):
+ return os.path.join(self.plot_dir, name + '.pdf')
+
+ summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall,
+ dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary'))
+
+ for detection_name in self.cfg.class_names:
+ class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath(detection_name + '_pr'))
+
+ class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp,
+ savepath=savepath(detection_name + '_tp'))
+
+ for dist_th in self.cfg.dist_ths:
+ dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath('dist_pr_' + str(dist_th)))
+
+
+if __name__ == "__main__":
+
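+    # Example invocation (hypothetical paths):
+    #   python mmcv/datasets/nuscnes_eval.py work_dirs/results_nusc.json \
+    #       --output_dir work_dirs/nusc_eval --eval_set val --dataroot data/nuscenes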
+ # Settings.
+ parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('result_path', type=str, help='The submission as a JSON file.')
+ parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics',
+ help='Folder to store result metrics, graphs and example visualizations.')
+ parser.add_argument('--eval_set', type=str, default='val',
+ help='Which dataset split to evaluate on, train, val or test.')
+ parser.add_argument('--dataroot', type=str, default='data/nuscenes',
+ help='Default nuScenes data directory.')
+ parser.add_argument('--version', type=str, default='v1.0-trainval',
+ help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
+    parser.add_argument('--config_path', type=str, default='',
+                        help='Path to the configuration file. '
+                             'If no path is given, the CVPR 2019 configuration will be used.')
+ parser.add_argument('--plot_examples', type=int, default=0,
+ help='How many example visualizations to write to disk.')
+ parser.add_argument('--render_curves', type=int, default=1,
+ help='Whether to render PR and TP curves to disk.')
+ parser.add_argument('--verbose', type=int, default=1,
+ help='Whether to print to stdout.')
+ args = parser.parse_args()
+
+ result_path_ = os.path.expanduser(args.result_path)
+ output_dir_ = os.path.expanduser(args.output_dir)
+ eval_set_ = args.eval_set
+ dataroot_ = args.dataroot
+ version_ = args.version
+ config_path = args.config_path
+ plot_examples_ = args.plot_examples
+ render_curves_ = bool(args.render_curves)
+ verbose_ = bool(args.verbose)
+
+ if config_path == '':
+ cfg_ = config_factory('detection_cvpr_2019')
+ else:
+ with open(config_path, 'r') as _f:
+ cfg_ = DetectionConfig.deserialize(json.load(_f))
+
+ nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
+ nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
+ output_dir=output_dir_, verbose=verbose_)
+ for vis in ['1', '2', '3', '4']:
+ nusc_eval.update_gt(type_='vis', visibility=vis)
+ print(f'================ {vis} ===============')
+ nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_)
+ #for index in range(1, 41):
+ # nusc_eval.update_gt(type_='ord', index=index)
+ #
diff --git a/mmcv/datasets/pipelines/__init__.py b/mmcv/datasets/pipelines/__init__.py
new file mode 100644
index 0000000..04e195d
--- /dev/null
+++ b/mmcv/datasets/pipelines/__init__.py
@@ -0,0 +1,50 @@
+from .compose import Compose
+from .formating import (Collect, Collect3D, DefaultFormatBundle, DefaultFormatBundle3D,
+ CustomDefaultFormatBundle3D, ImageToTensor,
+ ToDataContainer, ToTensor, Transpose, to_tensor,VADFormatBundle3D)
+from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam,
+ LoadMultiChannelImageFromFiles, LoadProposals,
+ LoadAnnotations3D, LoadImageFromFileMono3D,
+ LoadMultiViewImageFromFiles, LoadPointsFromFile,
+ LoadPointsFromMultiSweeps, NormalizePointsColor,
+ PointSegClassMapping, LoadAnnotations3D_E2E, CustomLoadPointsFromMultiSweeps, CustomLoadPointsFromFile)
+from .test_time_aug import MultiScaleFlipAug, MultiScaleFlipAug3D
+from .transforms_3d import (BackgroundPointsFilter, GlobalAlignment,
+ GlobalRotScaleTrans, IndoorPatchPointSample,
+ IndoorPointSample, ObjectNameFilter, ObjectNoise,
+ ObjectRangeFilter, ObjectSample, PointSample,
+ PointShuffle, PointsRangeFilter,
+ RandomDropPointsColor, RandomFlip3D,
+ RandomJitterPoints, VoxelBasedPointSampler,
+ PadMultiViewImage, NormalizeMultiviewImage,
+ PhotoMetricDistortionMultiViewImage, CustomCollect3D,
+ RandomScaleImageMultiViewImage,VADObjectRangeFilter,VADObjectNameFilter,CustomPointsRangeFilter)
+from .transforms import (Albu, CutOut, Expand, MinIoURandomCrop, Normalize,
+ Pad, PhotoMetricDistortion, RandomCenterCropPad,
+ RandomCrop, RandomFlip, RandomShift, Resize,
+ SegRescale)
+from .occflow_label import GenerateOccFlowLabels
+
+# __all__ = [
+# 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer',
+# 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations',
+# 'LoadImageFromFile', 'LoadImageFromWebcam',
+# 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug',
+# 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale',
+# 'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu',
+# 'InstaBoost', 'RandomCenterCropPad', 'AutoAugment', 'CutOut', 'Shear',
+# 'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform',
+# 'ContrastTransform', 'Translate', 'RandomShift',
+# 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
+# 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
+# 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
+# 'DefaultFormatBundle3D', 'DataBaseSampler',
+# 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample',
+# 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D',
+# 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter',
+# 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample',
+# 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor',
+# 'RandomJitterPoints', 'CustomDefaultFormatBundle3D', 'LoadAnnotations3D_E2E',
+# 'GenerateOccFlowLabels', 'PadMultiViewImage', 'NormalizeMultiviewImage',
+# 'PhotoMetricDistortionMultiViewImage', 'CustomCollect3D', 'RandomScaleImageMultiViewImage'
+# ]
diff --git a/mmcv/datasets/pipelines/compose.py b/mmcv/datasets/pipelines/compose.py
new file mode 100644
index 0000000..1567530
--- /dev/null
+++ b/mmcv/datasets/pipelines/compose.py
@@ -0,0 +1,51 @@
+import collections
+
+from mmcv.utils import build_from_cfg
+
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class Compose:
+ """Compose multiple transforms sequentially.
+
+ Args:
+ transforms (Sequence[dict | callable]): Sequence of transform object or
+ config dict to be composed.
+ """
+
+ def __init__(self, transforms):
+ assert isinstance(transforms, collections.abc.Sequence)
+ self.transforms = []
+ for transform in transforms:
+ if isinstance(transform, dict):
+ transform = build_from_cfg(transform, PIPELINES)
+ self.transforms.append(transform)
+ elif callable(transform):
+ self.transforms.append(transform)
+ else:
+ raise TypeError('transform must be callable or a dict')
+
+ def __call__(self, data):
+ """Call function to apply transforms sequentially.
+
+ Args:
+ data (dict): A result dict contains the data to transform.
+
+ Returns:
+ dict: Transformed data.
+ """
+
+ for t in self.transforms:
+ data = t(data)
+ if data is None:
+ return None
+ return data
+
+ def __repr__(self):
+ format_string = self.__class__.__name__ + '('
+ for t in self.transforms:
+ format_string += '\n'
+ format_string += f' {t}'
+ format_string += '\n)'
+ return format_string
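+
+
+# A minimal usage sketch (the transform configs below are illustrative; any
+# dicts registered in PIPELINES, or plain callables, can be composed):
+#
+#   pipeline = Compose([
+#       dict(type='LoadMultiViewImageFromFiles'),
+#       dict(type='PadMultiViewImage', size_divisor=32),
+#   ])
+#   results = pipeline(dict(img_filename=[...]))  # dict out, or None if a transform rejects the sample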
diff --git a/mmcv/datasets/pipelines/data_augment_utils.py b/mmcv/datasets/pipelines/data_augment_utils.py
new file mode 100644
index 0000000..231ab80
--- /dev/null
+++ b/mmcv/datasets/pipelines/data_augment_utils.py
@@ -0,0 +1,409 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numba
+import numpy as np
+import warnings
+from numba.errors import NumbaPerformanceWarning
+
+from mmcv.core.bbox import box_np_ops
+
+warnings.filterwarnings('ignore', category=NumbaPerformanceWarning)
+
+
+@numba.njit
+def _rotation_box2d_jit_(corners, angle, rot_mat_T):
+ """Rotate 2D boxes.
+
+ Args:
+ corners (np.ndarray): Corners of boxes.
+ angle (float): Rotation angle.
+ rot_mat_T (np.ndarray): Transposed rotation matrix.
+ """
+ rot_sin = np.sin(angle)
+ rot_cos = np.cos(angle)
+ rot_mat_T[0, 0] = rot_cos
+ rot_mat_T[0, 1] = -rot_sin
+ rot_mat_T[1, 0] = rot_sin
+ rot_mat_T[1, 1] = rot_cos
+ corners[:] = corners @ rot_mat_T
+
+
+@numba.jit(nopython=True)
+def box_collision_test(boxes, qboxes, clockwise=True):
+ """Box collision test.
+
+ Args:
+ boxes (np.ndarray): Corners of current boxes.
+ qboxes (np.ndarray): Boxes to be avoid colliding.
+ clockwise (bool): Whether the corners are in clockwise order.
+ Default: True.
+ """
+ N = boxes.shape[0]
+ K = qboxes.shape[0]
+ ret = np.zeros((N, K), dtype=np.bool_)
+ slices = np.array([1, 2, 3, 0])
+ lines_boxes = np.stack((boxes, boxes[:, slices, :]),
+ axis=2) # [N, 4, 2(line), 2(xy)]
+ lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2)
+ # vec = np.zeros((2,), dtype=boxes.dtype)
+ boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes)
+ qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes)
+ for i in range(N):
+ for j in range(K):
+ # calculate standup first
+ iw = (
+ min(boxes_standup[i, 2], qboxes_standup[j, 2]) -
+ max(boxes_standup[i, 0], qboxes_standup[j, 0]))
+ if iw > 0:
+ ih = (
+ min(boxes_standup[i, 3], qboxes_standup[j, 3]) -
+ max(boxes_standup[i, 1], qboxes_standup[j, 1]))
+ if ih > 0:
+ for k in range(4):
+ for box_l in range(4):
+ A = lines_boxes[i, k, 0]
+ B = lines_boxes[i, k, 1]
+ C = lines_qboxes[j, box_l, 0]
+ D = lines_qboxes[j, box_l, 1]
+ acd = (D[1] - A[1]) * (C[0] -
+ A[0]) > (C[1] - A[1]) * (
+ D[0] - A[0])
+ bcd = (D[1] - B[1]) * (C[0] -
+ B[0]) > (C[1] - B[1]) * (
+ D[0] - B[0])
+ if acd != bcd:
+ abc = (C[1] - A[1]) * (B[0] - A[0]) > (
+ B[1] - A[1]) * (
+ C[0] - A[0])
+ abd = (D[1] - A[1]) * (B[0] - A[0]) > (
+ B[1] - A[1]) * (
+ D[0] - A[0])
+ if abc != abd:
+ ret[i, j] = True # collision.
+ break
+ if ret[i, j] is True:
+ break
+ if ret[i, j] is False:
+ # now check complete overlap.
+ # box overlap qbox:
+ box_overlap_qbox = True
+ for box_l in range(4): # point l in qboxes
+ for k in range(4): # corner k in boxes
+ vec = boxes[i, k] - boxes[i, (k + 1) % 4]
+ if clockwise:
+ vec = -vec
+ cross = vec[1] * (
+ boxes[i, k, 0] - qboxes[j, box_l, 0])
+ cross -= vec[0] * (
+ boxes[i, k, 1] - qboxes[j, box_l, 1])
+ if cross >= 0:
+ box_overlap_qbox = False
+ break
+ if box_overlap_qbox is False:
+ break
+
+ if box_overlap_qbox is False:
+ qbox_overlap_box = True
+ for box_l in range(4): # point box_l in boxes
+ for k in range(4): # corner k in qboxes
+ vec = qboxes[j, k] - qboxes[j, (k + 1) % 4]
+ if clockwise:
+ vec = -vec
+ cross = vec[1] * (
+ qboxes[j, k, 0] - boxes[i, box_l, 0])
+ cross -= vec[0] * (
+ qboxes[j, k, 1] - boxes[i, box_l, 1])
+ if cross >= 0: #
+ qbox_overlap_box = False
+ break
+ if qbox_overlap_box is False:
+ break
+ if qbox_overlap_box:
+ ret[i, j] = True # collision.
+ else:
+ ret[i, j] = True # collision.
+ return ret
+
+
+@numba.njit
+def noise_per_box(boxes, valid_mask, loc_noises, rot_noises):
+ """Add noise to every box (only on the horizontal plane).
+
+ Args:
+ boxes (np.ndarray): Input boxes with shape (N, 5).
+ valid_mask (np.ndarray): Mask to indicate which boxes are valid
+ with shape (N).
+ loc_noises (np.ndarray): Location noises with shape (N, M, 3).
+ rot_noises (np.ndarray): Rotation noises with shape (N, M).
+
+ Returns:
+ np.ndarray: Mask to indicate whether the noise is
+ added successfully (pass the collision test).
+ """
+ num_boxes = boxes.shape[0]
+ num_tests = loc_noises.shape[1]
+ box_corners = box_np_ops.box2d_to_corner_jit(boxes)
+ current_corners = np.zeros((4, 2), dtype=boxes.dtype)
+ rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
+ success_mask = -np.ones((num_boxes, ), dtype=np.int64)
+ # print(valid_mask)
+ for i in range(num_boxes):
+ if valid_mask[i]:
+ for j in range(num_tests):
+ current_corners[:] = box_corners[i]
+ current_corners -= boxes[i, :2]
+ _rotation_box2d_jit_(current_corners, rot_noises[i, j],
+ rot_mat_T)
+ current_corners += boxes[i, :2] + loc_noises[i, j, :2]
+ coll_mat = box_collision_test(
+ current_corners.reshape(1, 4, 2), box_corners)
+ coll_mat[0, i] = False
+ # print(coll_mat)
+ if not coll_mat.any():
+ success_mask[i] = j
+ box_corners[i] = current_corners
+ break
+ return success_mask
+
+
+@numba.njit
+def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises,
+ global_rot_noises):
+ """Add noise to every box (only on the horizontal plane). Version 2 used
+ when enable global rotations.
+
+ Args:
+ boxes (np.ndarray): Input boxes with shape (N, 5).
+ valid_mask (np.ndarray): Mask to indicate which boxes are valid
+ with shape (N).
+ loc_noises (np.ndarray): Location noises with shape (N, M, 3).
+        rot_noises (np.ndarray): Rotation noises with shape (N, M).
+        global_rot_noises (np.ndarray): Global rotation noises with shape (N, M).
+
+ Returns:
+ np.ndarray: Mask to indicate whether the noise is
+ added successfully (pass the collision test).
+ """
+ num_boxes = boxes.shape[0]
+ num_tests = loc_noises.shape[1]
+ box_corners = box_np_ops.box2d_to_corner_jit(boxes)
+ current_corners = np.zeros((4, 2), dtype=boxes.dtype)
+ current_box = np.zeros((1, 5), dtype=boxes.dtype)
+ rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
+ dst_pos = np.zeros((2, ), dtype=boxes.dtype)
+ success_mask = -np.ones((num_boxes, ), dtype=np.int64)
+ corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
+ corners_norm[1, 1] = 1.0
+ corners_norm[2] = 1.0
+ corners_norm[3, 0] = 1.0
+ corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
+ corners_norm = corners_norm.reshape(4, 2)
+ for i in range(num_boxes):
+ if valid_mask[i]:
+ for j in range(num_tests):
+ current_box[0, :] = boxes[i]
+ current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2)
+ current_grot = np.arctan2(boxes[i, 0], boxes[i, 1])
+ dst_grot = current_grot + global_rot_noises[i, j]
+ dst_pos[0] = current_radius * np.sin(dst_grot)
+ dst_pos[1] = current_radius * np.cos(dst_grot)
+ current_box[0, :2] = dst_pos
+ current_box[0, -1] += (dst_grot - current_grot)
+
+ rot_sin = np.sin(current_box[0, -1])
+ rot_cos = np.cos(current_box[0, -1])
+ rot_mat_T[0, 0] = rot_cos
+ rot_mat_T[0, 1] = -rot_sin
+ rot_mat_T[1, 0] = rot_sin
+ rot_mat_T[1, 1] = rot_cos
+ current_corners[:] = current_box[
+ 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2]
+ current_corners -= current_box[0, :2]
+ _rotation_box2d_jit_(current_corners, rot_noises[i, j],
+ rot_mat_T)
+ current_corners += current_box[0, :2] + loc_noises[i, j, :2]
+ coll_mat = box_collision_test(
+ current_corners.reshape(1, 4, 2), box_corners)
+ coll_mat[0, i] = False
+ if not coll_mat.any():
+ success_mask[i] = j
+ box_corners[i] = current_corners
+ loc_noises[i, j, :2] += (dst_pos - boxes[i, :2])
+ rot_noises[i, j] += (dst_grot - current_grot)
+ break
+ return success_mask
+
+
+def _select_transform(transform, indices):
+ """Select transform.
+
+ Args:
+ transform (np.ndarray): Transforms to select from.
+ indices (np.ndarray): Mask to indicate which transform to select.
+
+ Returns:
+ np.ndarray: Selected transforms.
+ """
+ result = np.zeros((transform.shape[0], *transform.shape[2:]),
+ dtype=transform.dtype)
+ for i in range(transform.shape[0]):
+ if indices[i] != -1:
+ result[i] = transform[i, indices[i]]
+ return result
+
+
+@numba.njit
+def _rotation_matrix_3d_(rot_mat_T, angle, axis):
+ """Get the 3D rotation matrix.
+
+ Args:
+ rot_mat_T (np.ndarray): Transposed rotation matrix.
+ angle (float): Rotation angle.
+ axis (int): Rotation axis.
+ """
+ rot_sin = np.sin(angle)
+ rot_cos = np.cos(angle)
+ rot_mat_T[:] = np.eye(3)
+ if axis == 1:
+ rot_mat_T[0, 0] = rot_cos
+ rot_mat_T[0, 2] = -rot_sin
+ rot_mat_T[2, 0] = rot_sin
+ rot_mat_T[2, 2] = rot_cos
+ elif axis == 2 or axis == -1:
+ rot_mat_T[0, 0] = rot_cos
+ rot_mat_T[0, 1] = -rot_sin
+ rot_mat_T[1, 0] = rot_sin
+ rot_mat_T[1, 1] = rot_cos
+ elif axis == 0:
+ rot_mat_T[1, 1] = rot_cos
+ rot_mat_T[1, 2] = -rot_sin
+ rot_mat_T[2, 1] = rot_sin
+ rot_mat_T[2, 2] = rot_cos
+
+
+@numba.njit
+def points_transform_(points, centers, point_masks, loc_transform,
+ rot_transform, valid_mask):
+ """Apply transforms to points and box centers.
+
+ Args:
+ points (np.ndarray): Input points.
+ centers (np.ndarray): Input box centers.
+ point_masks (np.ndarray): Mask to indicate which points need
+ to be transformed.
+ loc_transform (np.ndarray): Location transform to be applied.
+ rot_transform (np.ndarray): Rotation transform to be applied.
+ valid_mask (np.ndarray): Mask to indicate which boxes are valid.
+ """
+ num_box = centers.shape[0]
+ num_points = points.shape[0]
+ rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype)
+ for i in range(num_box):
+ _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2)
+ for i in range(num_points):
+ for j in range(num_box):
+ if valid_mask[j]:
+ if point_masks[i, j] == 1:
+ points[i, :3] -= centers[j, :3]
+ points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j]
+ points[i, :3] += centers[j, :3]
+ points[i, :3] += loc_transform[j]
+ break # only apply first box's transform
+
+
+@numba.njit
+def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask):
+ """Transform 3D boxes.
+
+ Args:
+ boxes (np.ndarray): 3D boxes to be transformed.
+ loc_transform (np.ndarray): Location transform to be applied.
+ rot_transform (np.ndarray): Rotation transform to be applied.
+ valid_mask (np.ndarray | None): Mask to indicate which boxes are valid.
+ """
+ num_box = boxes.shape[0]
+ for i in range(num_box):
+ if valid_mask[i]:
+ boxes[i, :3] += loc_transform[i]
+ boxes[i, 6] += rot_transform[i]
+
+
+def noise_per_object_v3_(gt_boxes,
+ points=None,
+ valid_mask=None,
+ rotation_perturb=np.pi / 4,
+ center_noise_std=1.0,
+ global_random_rot_range=np.pi / 4,
+ num_try=100):
+ """Random rotate or remove each groundtruth independently. use kitti viewer
+ to test this function points_transform_
+
+ Args:
+ gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7).
+ points (np.ndarray | None): Input point cloud with shape (M, 4).
+ Default: None.
+ valid_mask (np.ndarray | None): Mask to indicate which boxes are valid.
+ Default: None.
+ rotation_perturb (float): Rotation perturbation. Default: pi / 4.
+ center_noise_std (float): Center noise standard deviation.
+ Default: 1.0.
+ global_random_rot_range (float): Global random rotation range.
+ Default: pi/4.
+        num_try (int): Number of attempts per box. Default: 100.
+
+    Note:
+        ``gt_boxes`` and ``points`` are modified in place.
+    """
+ num_boxes = gt_boxes.shape[0]
+ if not isinstance(rotation_perturb, (list, tuple, np.ndarray)):
+ rotation_perturb = [-rotation_perturb, rotation_perturb]
+ if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)):
+ global_random_rot_range = [
+ -global_random_rot_range, global_random_rot_range
+ ]
+ enable_grot = np.abs(global_random_rot_range[0] -
+ global_random_rot_range[1]) >= 1e-3
+
+ if not isinstance(center_noise_std, (list, tuple, np.ndarray)):
+ center_noise_std = [
+ center_noise_std, center_noise_std, center_noise_std
+ ]
+ if valid_mask is None:
+ valid_mask = np.ones((num_boxes, ), dtype=np.bool_)
+ center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype)
+
+ loc_noises = np.random.normal(
+ scale=center_noise_std, size=[num_boxes, num_try, 3])
+ rot_noises = np.random.uniform(
+ rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try])
+ gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1])
+ grot_lowers = global_random_rot_range[0] - gt_grots
+ grot_uppers = global_random_rot_range[1] - gt_grots
+ global_rot_noises = np.random.uniform(
+ grot_lowers[..., np.newaxis],
+ grot_uppers[..., np.newaxis],
+ size=[num_boxes, num_try])
+
+ origin = (0.5, 0.5, 0)
+ gt_box_corners = box_np_ops.center_to_corner_box3d(
+ gt_boxes[:, :3],
+ gt_boxes[:, 3:6],
+ gt_boxes[:, 6],
+ origin=origin,
+ axis=2)
+
+ # TODO: rewrite this noise box function?
+ if not enable_grot:
+ selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]],
+ valid_mask, loc_noises, rot_noises)
+ else:
+ selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]],
+ valid_mask, loc_noises, rot_noises,
+ global_rot_noises)
+
+ loc_transforms = _select_transform(loc_noises, selected_noise)
+ rot_transforms = _select_transform(rot_noises, selected_noise)
+ surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners)
+ if points is not None:
+ # TODO: replace this points_in_convex function by my tools?
+ point_masks = box_np_ops.points_in_convex_polygon_3d_jit(
+ points[:, :3], surfaces)
+ points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms,
+ rot_transforms, valid_mask)
+
+ box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask)
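+
+# Minimal usage sketch (shapes assumed, not taken from this repo's code):
+# perturb N boxes and the points that fall inside them, in place.
+#
+#   gt_boxes = ...  # np.ndarray of shape (N, 7): x, y, z, dx, dy, dz, yaw
+#   points = ...    # np.ndarray of shape (M, 4): x, y, z, intensity
+#   noise_per_object_v3_(gt_boxes, points, num_try=50)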
diff --git a/mmcv/datasets/pipelines/formating.py b/mmcv/datasets/pipelines/formating.py
new file mode 100644
index 0000000..a7b3e61
--- /dev/null
+++ b/mmcv/datasets/pipelines/formating.py
@@ -0,0 +1,700 @@
+from collections.abc import Sequence
+
+import numpy as np
+import torch
+from mmcv.parallel import DataContainer as DC
+
+from mmcv.core.bbox.structures.base_box3d import BaseInstance3DBoxes
+from mmcv.core.points import BasePoints
+from mmcv.utils import is_str
+from ..builder import PIPELINES
+
+
+def to_tensor(data):
+ """Convert objects of various python types to :obj:`torch.Tensor`.
+
+ Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+ :class:`Sequence`, :class:`int` and :class:`float`.
+
+ Args:
+ data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
+ be converted.
+ """
+
+ if isinstance(data, torch.Tensor):
+ return data
+ elif isinstance(data, np.ndarray):
+ return torch.from_numpy(data)
+ elif isinstance(data, Sequence) and not is_str(data):
+ return torch.tensor(data)
+ elif isinstance(data, int):
+ return torch.LongTensor([data])
+ elif isinstance(data, float):
+ return torch.FloatTensor([data])
+ else:
+ raise TypeError(f'type {type(data)} cannot be converted to tensor.')
+
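+# Illustrative conversions performed by ``to_tensor`` (examples only, not part
+# of the original file):
+#
+#   to_tensor(np.ones((2, 3), dtype=np.float32))  # float32 tensor, shape (2, 3)
+#   to_tensor([1, 2, 3])                          # tensor([1, 2, 3])
+#   to_tensor(5)                                  # LongTensor([5])
+#   to_tensor(0.5)                                # FloatTensor([0.5])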
+
+@PIPELINES.register_module()
+class ToTensor:
+ """Convert some results to :obj:`torch.Tensor` by given keys.
+
+ Args:
+ keys (Sequence[str]): Keys that need to be converted to Tensor.
+ """
+
+ def __init__(self, keys):
+ self.keys = keys
+
+ def __call__(self, results):
+ """Call function to convert data in results to :obj:`torch.Tensor`.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data converted
+ to :obj:`torch.Tensor`.
+ """
+ for key in self.keys:
+ results[key] = to_tensor(results[key])
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class ImageToTensor:
+ """Convert image to :obj:`torch.Tensor` by given keys.
+
+ The dimension order of input image is (H, W, C). The pipeline will convert
+ it to (C, H, W). If only 2 dimension (H, W) is given, the output would be
+ (1, H, W).
+
+ Args:
+ keys (Sequence[str]): Key of images to be converted to Tensor.
+ """
+
+ def __init__(self, keys):
+ self.keys = keys
+
+ def __call__(self, results):
+ """Call function to convert image in results to :obj:`torch.Tensor` and
+ transpose the channel order.
+
+ Args:
+ results (dict): Result dict contains the image data to convert.
+
+ Returns:
+ dict: The result dict contains the image converted
+ to :obj:`torch.Tensor` and transposed to (C, H, W) order.
+ """
+ for key in self.keys:
+ img = results[key]
+ if len(img.shape) < 3:
+ img = np.expand_dims(img, -1)
+ results[key] = to_tensor(img.transpose(2, 0, 1))
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class Transpose:
+ """Transpose some results by given keys.
+
+ Args:
+ keys (Sequence[str]): Keys of results to be transposed.
+ order (Sequence[int]): Order of transpose.
+ """
+
+ def __init__(self, keys, order):
+ self.keys = keys
+ self.order = order
+
+ def __call__(self, results):
+ """Call function to transpose the channel order of data in results.
+
+ Args:
+ results (dict): Result dict contains the data to transpose.
+
+ Returns:
+ dict: The result dict contains the data transposed to \
+ ``self.order``.
+ """
+ for key in self.keys:
+ results[key] = results[key].transpose(self.order)
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + \
+ f'(keys={self.keys}, order={self.order})'
+
+
+@PIPELINES.register_module()
+class ToDataContainer:
+ """Convert results to :obj:`mmcv.DataContainer` by given fields.
+
+ Args:
+ fields (Sequence[dict]): Each field is a dict like
+ ``dict(key='xxx', **kwargs)``. The ``key`` in result will
+ be converted to :obj:`mmcv.DataContainer` with ``**kwargs``.
+ Default: ``(dict(key='img', stack=True), dict(key='gt_bboxes'),
+ dict(key='gt_labels'))``.
+ """
+
+ def __init__(self,
+ fields=(dict(key='img', stack=True), dict(key='gt_bboxes'),
+ dict(key='gt_labels'))):
+ self.fields = fields
+
+ def __call__(self, results):
+ """Call function to convert data in results to
+ :obj:`mmcv.DataContainer`.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data converted to \
+ :obj:`mmcv.DataContainer`.
+ """
+
+ for field in self.fields:
+ field = field.copy()
+ key = field.pop('key')
+ results[key] = DC(results[key], **field)
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(fields={self.fields})'
+
+
+@PIPELINES.register_module()
+class DefaultFormatBundle:
+ """Default formatting bundle.
+
+ It simplifies the pipeline of formatting common fields, including "img",
+ "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
+ These fields are formatted as follows.
+
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+ - proposals: (1)to tensor, (2)to DataContainer
+ - gt_bboxes: (1)to tensor, (2)to DataContainer
+ - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+ - gt_labels: (1)to tensor, (2)to DataContainer
+ - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
+ - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
+ (3)to DataContainer (stack=True)
+ """
+
+ def __call__(self, results):
+ """Call function to transform and format common fields in results.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data that is formatted with \
+ default bundle.
+ """
+
+ if 'img' in results:
+ img = results['img']
+ # add default meta keys
+ results = self._add_default_meta_keys(results)
+ if len(img.shape) < 3:
+ img = np.expand_dims(img, -1)
+ img = np.ascontiguousarray(img.transpose(2, 0, 1))
+ results['img'] = DC(to_tensor(img), stack=True)
+ for key in ['proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels']:
+ if key not in results:
+ continue
+ results[key] = DC(to_tensor(results[key]))
+ if 'gt_masks' in results:
+ results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
+ if 'gt_semantic_seg' in results:
+ results['gt_semantic_seg'] = DC(
+ to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
+ return results
+
+ def _add_default_meta_keys(self, results):
+ """Add default meta keys.
+
+ We set default meta keys including `pad_shape`, `scale_factor` and
+ `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and
+ `Pad` are implemented during the whole pipeline.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ results (dict): Updated result dict contains the data to convert.
+ """
+ img = results['img']
+ results.setdefault('pad_shape', img.shape)
+ results.setdefault('scale_factor', 1.0)
+ num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+ results.setdefault(
+ 'img_norm_cfg',
+ dict(
+ mean=np.zeros(num_channels, dtype=np.float32),
+ std=np.ones(num_channels, dtype=np.float32),
+ to_rgb=False))
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__
+
+
+@PIPELINES.register_module()
+class Collect:
+ """Collect data from the loader relevant to the specific task.
+
+ This is usually the last stage of the data loader pipeline. Typically keys
+ is set to some subset of "img", "proposals", "gt_bboxes",
+ "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
+
+ The "img_meta" item is always populated. The contents of the "img_meta"
+ dictionary depends on "meta_keys". By default this includes:
+
+ - "img_shape": shape of the image input to the network as a tuple \
+ (h, w, c). Note that images may be zero padded on the \
+ bottom/right if the batch tensor is larger than this shape.
+
+ - "scale_factor": a float indicating the preprocessing scale
+
+ - "flip": a boolean indicating if image flip transform was used
+
+ - "filename": path to the image file
+
+ - "ori_shape": original shape of the image as a tuple (h, w, c)
+
+ - "pad_shape": image shape after padding
+
+ - "img_norm_cfg": a dict of normalization information:
+
+ - mean - per channel mean subtraction
+ - std - per channel std divisor
+ - to_rgb - bool indicating if bgr was converted to rgb
+
+ Args:
+ keys (Sequence[str]): Keys of results to be collected in ``data``.
+ meta_keys (Sequence[str], optional): Meta keys to be converted to
+ ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+ Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape',
+ 'pad_shape', 'scale_factor', 'flip', 'flip_direction',
+ 'img_norm_cfg')``
+ """
+
+ def __init__(self,
+ keys,
+ meta_keys=('filename', 'ori_filename', 'ori_shape',
+ 'img_shape', 'pad_shape', 'scale_factor', 'flip',
+ 'flip_direction', 'img_norm_cfg')):
+ self.keys = keys
+ self.meta_keys = meta_keys
+
+ def __call__(self, results):
+ """Call function to collect keys in results. The keys in ``meta_keys``
+        will be converted to :obj:`mmcv.DataContainer`.
+
+ Args:
+ results (dict): Result dict contains the data to collect.
+
+ Returns:
+ dict: The result dict contains the following keys
+
+            - keys in ``self.keys``
+ - ``img_metas``
+ """
+
+ data = {}
+ img_meta = {}
+ for key in self.meta_keys:
+ img_meta[key] = results[key]
+ data['img_metas'] = DC(img_meta, cpu_only=True)
+ for key in self.keys:
+ data[key] = results[key]
+ return data
+
+ def __repr__(self):
+ return self.__class__.__name__ + \
+ f'(keys={self.keys}, meta_keys={self.meta_keys})'
+
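+# Hypothetical pipeline snippet showing where ``Collect`` usually sits as the
+# final 2D formatting step (key names are illustrative, not from this repo):
+#
+#   test_pipeline = [
+#       dict(type='LoadImageFromFile'),
+#       dict(type='ImageToTensor', keys=['img']),
+#       dict(type='Collect', keys=['img']),
+#   ]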
+
+@PIPELINES.register_module()
+class WrapFieldsToLists:
+ """Wrap fields of the data dictionary into lists for evaluation.
+
+ This class can be used as a last step of a test or validation
+ pipeline for single image evaluation or inference.
+
+ Example:
+ >>> test_pipeline = [
+ >>> dict(type='LoadImageFromFile'),
+ >>> dict(type='Normalize',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ to_rgb=True),
+ >>> dict(type='Pad', size_divisor=32),
+ >>> dict(type='ImageToTensor', keys=['img']),
+ >>> dict(type='Collect', keys=['img']),
+ >>> dict(type='WrapFieldsToLists')
+ >>> ]
+ """
+
+ def __call__(self, results):
+ """Call function to wrap fields into lists.
+
+ Args:
+ results (dict): Result dict contains the data to wrap.
+
+ Returns:
+ dict: The result dict where value of ``self.keys`` are wrapped \
+ into list.
+ """
+
+ # Wrap dict fields into lists
+ for key, val in results.items():
+ results[key] = [val]
+ return results
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}()'
+
+
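+# The 2D ``DefaultFormatBundle`` registered above is removed from the registry
+# so the version defined below (which also handles multi-view image lists and
+# 3D annotation fields) can be registered under the same name.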
+PIPELINES._module_dict.pop('DefaultFormatBundle')
+
+@PIPELINES.register_module()
+class DefaultFormatBundle(object):
+ """Default formatting bundle.
+
+ It simplifies the pipeline of formatting common fields, including "img",
+ "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
+ These fields are formatted as follows.
+
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+ - proposals: (1)to tensor, (2)to DataContainer
+ - gt_bboxes: (1)to tensor, (2)to DataContainer
+ - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+ - gt_labels: (1)to tensor, (2)to DataContainer
+ - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
+ - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
+ (3)to DataContainer (stack=True)
+ """
+
+    def __init__(self):
+        pass
+
+ def __call__(self, results):
+ """Call function to transform and format common fields in results.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data that is formatted with
+ default bundle.
+ """
+ if 'img' in results:
+ if isinstance(results['img'], list):
+ # process multiple imgs in single frame
+ imgs = [img.transpose(2, 0, 1) for img in results['img']]
+ imgs = np.ascontiguousarray(np.stack(imgs, axis=0))
+ results['img'] = DC(to_tensor(imgs), stack=True)
+ else:
+ img = np.ascontiguousarray(results['img'].transpose(2, 0, 1))
+ results['img'] = DC(to_tensor(img), stack=True)
+ for key in [
+ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
+ 'gt_labels_3d', 'attr_labels', 'pts_instance_mask',
+ 'pts_semantic_mask', 'centers2d', 'depths'
+ ]:
+ if key not in results:
+ continue
+ if isinstance(results[key], list):
+ results[key] = DC([to_tensor(res) for res in results[key]])
+ else:
+ results[key] = DC(to_tensor(results[key]))
+ if 'gt_bboxes_3d' in results:
+ if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
+ results['gt_bboxes_3d'] = DC(
+ results['gt_bboxes_3d'], cpu_only=True)
+ else:
+ results['gt_bboxes_3d'] = DC(
+ to_tensor(results['gt_bboxes_3d']))
+
+ if 'gt_masks' in results:
+ results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
+ if 'gt_semantic_seg' in results:
+ results['gt_semantic_seg'] = DC(
+ to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
+
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__
+
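+# Shape sketch for the multi-view branch above (sizes are assumptions): a list
+# of 6 arrays of shape (H, W, 3) is transposed and stacked into one contiguous
+# array of shape (6, 3, H, W), then wrapped in DataContainer(stack=True).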
+
+@PIPELINES.register_module()
+class Collect3D(object):
+ """Collect data from the loader relevant to the specific task.
+
+ This is usually the last stage of the data loader pipeline. Typically keys
+ is set to some subset of "img", "proposals", "gt_bboxes",
+ "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
+
+ The "img_meta" item is always populated. The contents of the "img_meta"
+ dictionary depends on "meta_keys". By default this includes:
+
+ - 'img_shape': shape of the image input to the network as a tuple \
+ (h, w, c). Note that images may be zero padded on the \
+ bottom/right if the batch tensor is larger than this shape.
+ - 'scale_factor': a float indicating the preprocessing scale
+ - 'flip': a boolean indicating if image flip transform was used
+ - 'filename': path to the image file
+ - 'ori_shape': original shape of the image as a tuple (h, w, c)
+ - 'pad_shape': image shape after padding
+ - 'lidar2img': transform from lidar to image
+ - 'depth2img': transform from depth to image
+ - 'cam2img': transform from camera to image
+ - 'pcd_horizontal_flip': a boolean indicating if point cloud is \
+ flipped horizontally
+ - 'pcd_vertical_flip': a boolean indicating if point cloud is \
+ flipped vertically
+ - 'box_mode_3d': 3D box mode
+ - 'box_type_3d': 3D box type
+ - 'img_norm_cfg': a dict of normalization information:
+ - mean: per channel mean subtraction
+ - std: per channel std divisor
+ - to_rgb: bool indicating if bgr was converted to rgb
+ - 'pcd_trans': point cloud transformations
+ - 'sample_idx': sample index
+ - 'pcd_scale_factor': point cloud scale factor
+ - 'pcd_rotation': rotation applied to point cloud
+ - 'pts_filename': path to point cloud file.
+
+ Args:
+ keys (Sequence[str]): Keys of results to be collected in ``data``.
+ meta_keys (Sequence[str], optional): Meta keys to be converted to
+ ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+ Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
+ 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
+ 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
+ 'box_type_3d', 'img_norm_cfg', 'pcd_trans',
+ 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
+ """
+
+ def __init__(self,
+ keys,
+ meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
+ 'depth2img', 'cam2img', 'pad_shape',
+ 'scale_factor', 'flip', 'pcd_horizontal_flip',
+ 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+ 'img_norm_cfg', 'pcd_trans', 'sample_idx',
+ 'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+ 'transformation_3d_flow')):
+ self.keys = keys
+ self.meta_keys = meta_keys
+
+ def __call__(self, results):
+ """Call function to collect keys in results. The keys in ``meta_keys``
+ will be converted to :obj:`mmcv.DataContainer`.
+
+ Args:
+ results (dict): Result dict contains the data to collect.
+
+ Returns:
+ dict: The result dict contains the following keys
+ - keys in ``self.keys``
+ - ``img_metas``
+ """
+ data = {}
+ img_metas = {}
+ for key in self.meta_keys:
+ if key in results:
+ img_metas[key] = results[key]
+
+ data['img_metas'] = DC(img_metas, cpu_only=True)
+ for key in self.keys:
+ data[key] = results[key]
+ return data
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ return self.__class__.__name__ + \
+ f'(keys={self.keys}, meta_keys={self.meta_keys})'
+
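+# Note: unlike the 2D ``Collect`` above, ``Collect3D`` skips meta keys that are
+# missing from ``results``. A hypothetical config entry:
+#
+#   dict(type='Collect3D', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])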
+
+@PIPELINES.register_module()
+class DefaultFormatBundle3D(DefaultFormatBundle):
+ """Default formatting bundle.
+
+ It simplifies the pipeline of formatting common fields for voxels,
+ including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
+ "gt_semantic_seg".
+ These fields are formatted as follows.
+
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+ - proposals: (1)to tensor, (2)to DataContainer
+ - gt_bboxes: (1)to tensor, (2)to DataContainer
+ - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+ - gt_labels: (1)to tensor, (2)to DataContainer
+ """
+
+ def __init__(self, class_names, with_gt=True, with_label=True):
+ super(DefaultFormatBundle3D, self).__init__()
+ self.class_names = class_names
+ self.with_gt = with_gt
+ self.with_label = with_label
+
+ def __call__(self, results):
+ """Call function to transform and format common fields in results.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data that is formatted with
+ default bundle.
+ """
+ # Format 3D data
+ if 'points' in results:
+ assert isinstance(results['points'], BasePoints)
+ results['points'] = DC(results['points'].tensor)
+
+ for key in ['voxels', 'coors', 'voxel_centers', 'num_points']:
+ if key not in results:
+ continue
+ results[key] = DC(to_tensor(results[key]), stack=False)
+
+ if self.with_gt:
+ # Clean GT bboxes in the final
+ if 'gt_bboxes_3d_mask' in results:
+ gt_bboxes_3d_mask = results['gt_bboxes_3d_mask']
+ results['gt_bboxes_3d'] = results['gt_bboxes_3d'][
+ gt_bboxes_3d_mask]
+ if 'gt_names_3d' in results:
+ results['gt_names_3d'] = results['gt_names_3d'][
+ gt_bboxes_3d_mask]
+ if 'centers2d' in results:
+ results['centers2d'] = results['centers2d'][
+ gt_bboxes_3d_mask]
+ if 'depths' in results:
+ results['depths'] = results['depths'][gt_bboxes_3d_mask]
+ if 'gt_bboxes_mask' in results:
+ gt_bboxes_mask = results['gt_bboxes_mask']
+ if 'gt_bboxes' in results:
+ results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask]
+ results['gt_names'] = results['gt_names'][gt_bboxes_mask]
+ if self.with_label:
+ if 'gt_names' in results and len(results['gt_names']) == 0:
+ results['gt_labels'] = np.array([], dtype=np.int64)
+ results['attr_labels'] = np.array([], dtype=np.int64)
+ elif 'gt_names' in results and isinstance(
+ results['gt_names'][0], list):
+ # gt_labels might be a list of list in multi-view setting
+ results['gt_labels'] = [
+ np.array([self.class_names.index(n) for n in res],
+ dtype=np.int64) for res in results['gt_names']
+ ]
+ elif 'gt_names' in results:
+ results['gt_labels'] = np.array([
+ self.class_names.index(n) for n in results['gt_names']
+ ],
+ dtype=np.int64)
+ # we still assume one pipeline for one frame LiDAR
+ # thus, the 3D name is list[string]
+ if 'gt_names_3d' in results:
+ results['gt_labels_3d'] = np.array([
+ self.class_names.index(n)
+ for n in results['gt_names_3d']
+ ],
+ dtype=np.int64)
+ results = super(DefaultFormatBundle3D, self).__call__(results)
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(class_names={self.class_names}, '
+ repr_str += f'with_gt={self.with_gt}, with_label={self.with_label})'
+ return repr_str
+
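+# Label-mapping sketch for the ``with_label`` branch of DefaultFormatBundle3D
+# above (class names are made up for illustration):
+#
+#   bundle = DefaultFormatBundle3D(class_names=['car', 'pedestrian'])
+#   # results['gt_names_3d'] = ['car', 'car', 'pedestrian']
+#   # -> results['gt_labels_3d'] = np.array([0, 0, 1], dtype=np.int64)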
+@PIPELINES.register_module()
+class CustomDefaultFormatBundle3D(DefaultFormatBundle3D):
+ """Default formatting bundle.
+ It simplifies the pipeline of formatting common fields for voxels,
+ including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
+ "gt_semantic_seg".
+ These fields are formatted as follows.
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+ - proposals: (1)to tensor, (2)to DataContainer
+ - gt_bboxes: (1)to tensor, (2)to DataContainer
+ - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+ - gt_labels: (1)to tensor, (2)to DataContainer
+ """
+
+ def __call__(self, results):
+ """Call function to transform and format common fields in results.
+ Args:
+ results (dict): Result dict contains the data to convert.
+ Returns:
+ dict: The result dict contains the data that is formatted with
+ default bundle.
+ """
+ # Format 3D data
+ results = super(CustomDefaultFormatBundle3D, self).__call__(results)
+ results['gt_map_masks'] = DC(
+ to_tensor(results['gt_map_masks']), stack=True)
+
+ return results
+
+@PIPELINES.register_module()
+class VADFormatBundle3D(DefaultFormatBundle3D):
+ """Default formatting bundle.
+ It simplifies the pipeline of formatting common fields for voxels,
+ including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
+ "gt_semantic_seg".
+ These fields are formatted as follows.
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+ - proposals: (1)to tensor, (2)to DataContainer
+ - gt_bboxes: (1)to tensor, (2)to DataContainer
+ - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+ - gt_labels: (1)to tensor, (2)to DataContainer
+ """
+ def __init__(self, class_names, with_gt=True, with_label=True, with_ego=True):
+ super(VADFormatBundle3D, self).__init__(class_names, with_gt, with_label)
+ self.with_ego = with_ego
+
+
+ def __call__(self, results):
+ """Call function to transform and format common fields in results.
+ Args:
+ results (dict): Result dict contains the data to convert.
+ Returns:
+ dict: The result dict contains the data that is formatted with
+ default bundle.
+ """
+ # Format 3D data
+ results = super(VADFormatBundle3D, self).__call__(results)
+ # results['gt_map_masks'] = DC(to_tensor(results['gt_map_masks']), stack=True)
+ if self.with_ego:
+ if 'ego_his_trajs' in results:
+ results['ego_his_trajs'] = DC(to_tensor(results['ego_his_trajs'][None, ...]), stack=True)
+ if 'ego_fut_trajs' in results:
+ results['ego_fut_trajs'] = DC(to_tensor(results['ego_fut_trajs'][None, ...]), stack=True)
+ if 'ego_fut_masks' in results:
+ results['ego_fut_masks'] = DC(to_tensor(results['ego_fut_masks'][None, None, ...]), stack=True)
+ if 'ego_fut_cmd' in results:
+ results['ego_fut_cmd'] = DC(to_tensor(results['ego_fut_cmd'][None, None, ...]), stack=True)
+ if 'ego_lcf_feat' in results:
+ results['ego_lcf_feat'] = DC(to_tensor(results['ego_lcf_feat'][None, None, ...]), stack=True)
+ if 'gt_attr_labels' in results:
+ results['gt_attr_labels'] = DC(to_tensor(results['gt_attr_labels']), cpu_only=False)
+
+ return results
+
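+
+# Hypothetical config usage of the VAD bundle above (argument values are
+# illustrative only):
+#
+#   dict(type='VADFormatBundle3D', class_names=class_names, with_ego=True)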
diff --git a/mmcv/datasets/pipelines/loading.py b/mmcv/datasets/pipelines/loading.py
new file mode 100644
index 0000000..dbf494e
--- /dev/null
+++ b/mmcv/datasets/pipelines/loading.py
@@ -0,0 +1,1709 @@
+import os
+import os.path as osp
+import torch
+import mmcv
+import numpy as np
+import pycocotools.mask as maskUtils
+from einops import rearrange
+from mmcv.core.points import BasePoints, get_points_type
+from mmcv.fileio.file_client import FileClient
+from mmcv.image import imfrombytes, imread
+from mmcv.utils import check_file_exist
+from mmcv.core.mask.structures import BitmapMasks, PolygonMasks
+# from mmcv.datasets.pipelines.loading import LoadAnnotations, LoadImageFromFile
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class LoadImageFromFile:
+ """Load an image from file.
+
+ Required keys are "img_prefix" and "img_info" (a dict that must contain the
+ key "filename"). Added or updated keys are "filename", "img", "img_shape",
+ "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
+ "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
+
+ Args:
+ to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the loaded image is a uint8 array.
+ Defaults to False.
+ color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
+ Defaults to 'color'.
+ file_client_args (dict): Arguments to instantiate a FileClient.
+ See :class:`mmcv.fileio.FileClient` for details.
+ Defaults to ``dict(backend='disk')``.
+ """
+
+ def __init__(self,
+ to_float32=False,
+ color_type='color',
+ file_client_args=dict(backend='disk')):
+ self.to_float32 = to_float32
+ self.color_type = color_type
+ self.file_client_args = file_client_args.copy()
+ self.file_client = None
+
+ def __call__(self, results):
+ """Call functions to load image and get image meta information.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded image and meta information.
+ """
+
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+
+ if results['img_prefix'] is not None:
+ filename = osp.join(results['img_prefix'],
+ results['img_info']['filename'])
+ else:
+ filename = results['img_info']['filename']
+
+ img_bytes = self.file_client.get(filename)
+ img = imfrombytes(img_bytes, flag=self.color_type)
+ if self.to_float32:
+ img = img.astype(np.float32)
+
+ results['filename'] = filename
+ results['ori_filename'] = results['img_info']['filename']
+ results['img'] = img
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ results['img_fields'] = ['img']
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'to_float32={self.to_float32}, '
+ f"color_type='{self.color_type}', "
+ f'file_client_args={self.file_client_args})')
+ return repr_str
+
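+# Minimal input sketch for the loader above (paths are made up):
+#
+#   results = dict(img_prefix='data/imgs', img_info=dict(filename='0001.jpg'))
+#   results = LoadImageFromFile(to_float32=True)(results)
+#   # adds 'img' (H x W x 3, float32), 'img_shape', 'ori_shape', 'img_fields'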
+
+@PIPELINES.register_module()
+class LoadImageFromWebcam(LoadImageFromFile):
+ """Load an image from webcam.
+
+    Similar to :obj:`LoadImageFromFile`, but the image read from webcam is in
+ ``results['img']``.
+ """
+
+ def __call__(self, results):
+ """Call functions to add image meta information.
+
+ Args:
+ results (dict): Result dict with Webcam read image in
+ ``results['img']``.
+
+ Returns:
+ dict: The dict contains loaded image and meta information.
+ """
+
+ img = results['img']
+ if self.to_float32:
+ img = img.astype(np.float32)
+
+ results['filename'] = None
+ results['ori_filename'] = None
+ results['img'] = img
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ results['img_fields'] = ['img']
+ return results
+
+
+@PIPELINES.register_module()
+class LoadMultiChannelImageFromFiles:
+ """Load multi-channel images from a list of separate channel files.
+
+ Required keys are "img_prefix" and "img_info" (a dict that must contain the
+ key "filename", which is expected to be a list of filenames).
+ Added or updated keys are "filename", "img", "img_shape",
+ "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
+ "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
+
+ Args:
+ to_float32 (bool): Whether to convert the loaded image to a float32
+ numpy array. If set to False, the loaded image is an uint8 array.
+ Defaults to False.
+ color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
+ Defaults to 'color'.
+ file_client_args (dict): Arguments to instantiate a FileClient.
+ See :class:`mmcv.fileio.FileClient` for details.
+ Defaults to ``dict(backend='disk')``.
+ """
+
+ def __init__(self,
+ to_float32=False,
+ color_type='unchanged',
+ file_client_args=dict(backend='disk')):
+ self.to_float32 = to_float32
+ self.color_type = color_type
+ self.file_client_args = file_client_args.copy()
+ self.file_client = None
+
+ def __call__(self, results):
+ """Call functions to load multiple images and get images meta
+ information.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded images and meta information.
+ """
+
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+
+ if results['img_prefix'] is not None:
+ filename = [
+ osp.join(results['img_prefix'], fname)
+ for fname in results['img_info']['filename']
+ ]
+ else:
+ filename = results['img_info']['filename']
+
+ img = []
+ for name in filename:
+ img_bytes = self.file_client.get(name)
+ img.append(imfrombytes(img_bytes, flag=self.color_type))
+ img = np.stack(img, axis=-1)
+ if self.to_float32:
+ img = img.astype(np.float32)
+
+ results['filename'] = filename
+ results['ori_filename'] = results['img_info']['filename']
+ results['img'] = img
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ # Set initial values for default meta_keys
+ results['pad_shape'] = img.shape
+ results['scale_factor'] = 1.0
+ num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+ results['img_norm_cfg'] = dict(
+ mean=np.zeros(num_channels, dtype=np.float32),
+ std=np.ones(num_channels, dtype=np.float32),
+ to_rgb=False)
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'to_float32={self.to_float32}, '
+ f"color_type='{self.color_type}', "
+ f'file_client_args={self.file_client_args})')
+ return repr_str
+
+
+@PIPELINES.register_module()
+class LoadAnnotations:
+ """Load multiple types of annotations.
+
+ Args:
+ with_bbox (bool): Whether to parse and load the bbox annotation.
+ Default: True.
+ with_label (bool): Whether to parse and load the label annotation.
+ Default: True.
+ with_mask (bool): Whether to parse and load the mask annotation.
+ Default: False.
+ with_seg (bool): Whether to parse and load the semantic segmentation
+ annotation. Default: False.
+ poly2mask (bool): Whether to convert the instance masks from polygons
+ to bitmaps. Default: True.
+ file_client_args (dict): Arguments to instantiate a FileClient.
+ See :class:`mmcv.fileio.FileClient` for details.
+ Defaults to ``dict(backend='disk')``.
+ """
+
+ def __init__(self,
+ with_bbox=True,
+ with_label=True,
+ with_mask=False,
+ with_seg=False,
+ poly2mask=True,
+ file_client_args=dict(backend='disk')):
+ self.with_bbox = with_bbox
+ self.with_label = with_label
+ self.with_mask = with_mask
+ self.with_seg = with_seg
+ self.poly2mask = poly2mask
+ self.file_client_args = file_client_args.copy()
+ self.file_client = None
+
+ def _load_bboxes(self, results):
+ """Private function to load bounding box annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded bounding box annotations.
+ """
+
+ ann_info = results['ann_info']
+ results['gt_bboxes'] = ann_info['bboxes'].copy()
+
+ gt_bboxes_ignore = ann_info.get('bboxes_ignore', None)
+ if gt_bboxes_ignore is not None:
+ results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy()
+ results['bbox_fields'].append('gt_bboxes_ignore')
+ results['bbox_fields'].append('gt_bboxes')
+ return results
+
+ def _load_labels(self, results):
+ """Private function to load label annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded label annotations.
+ """
+
+ results['gt_labels'] = results['ann_info']['labels'].copy()
+ return results
+
+ def _poly2mask(self, mask_ann, img_h, img_w):
+ """Private function to convert masks represented with polygon to
+ bitmaps.
+
+ Args:
+ mask_ann (list | dict): Polygon mask annotation input.
+ img_h (int): The height of output mask.
+ img_w (int): The width of output mask.
+
+ Returns:
+ numpy.ndarray: The decode bitmap mask of shape (img_h, img_w).
+ """
+
+ if isinstance(mask_ann, list):
+ # polygon -- a single object might consist of multiple parts
+ # we merge all parts into one mask rle code
+ rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+ rle = maskUtils.merge(rles)
+ elif isinstance(mask_ann['counts'], list):
+ # uncompressed RLE
+ rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+ else:
+ # rle
+ rle = mask_ann
+ mask = maskUtils.decode(rle)
+ return mask
+
+ def process_polygons(self, polygons):
+ """Convert polygons to list of ndarray and filter invalid polygons.
+
+ Args:
+ polygons (list[list]): Polygons of one instance.
+
+ Returns:
+ list[numpy.ndarray]: Processed polygons.
+ """
+
+ polygons = [np.array(p) for p in polygons]
+ valid_polygons = []
+ for polygon in polygons:
+ if len(polygon) % 2 == 0 and len(polygon) >= 6:
+ valid_polygons.append(polygon)
+ return valid_polygons
+
+ def _load_masks(self, results):
+ """Private function to load mask annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded mask annotations.
+                If ``self.poly2mask`` is set ``True``, ``gt_masks`` will
+                contain :obj:`BitmapMasks`. Otherwise, :obj:`PolygonMasks`
+                is used.
+ """
+
+ h, w = results['img_info']['height'], results['img_info']['width']
+ gt_masks = results['ann_info']['masks']
+ if self.poly2mask:
+ gt_masks = BitmapMasks(
+ [self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
+ else:
+ gt_masks = PolygonMasks(
+ [self.process_polygons(polygons) for polygons in gt_masks], h,
+ w)
+ results['gt_masks'] = gt_masks
+ results['mask_fields'].append('gt_masks')
+ return results
+
+ def _load_semantic_seg(self, results):
+ """Private function to load semantic segmentation annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`dataset`.
+
+ Returns:
+ dict: The dict contains loaded semantic segmentation annotations.
+ """
+
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+
+ filename = osp.join(results['seg_prefix'],
+ results['ann_info']['seg_map'])
+ img_bytes = self.file_client.get(filename)
+ results['gt_semantic_seg'] = imfrombytes(
+ img_bytes, flag='unchanged').squeeze()
+ results['seg_fields'].append('gt_semantic_seg')
+ return results
+
+ def __call__(self, results):
+ """Call function to load multiple types annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded bounding box, label, mask and
+ semantic segmentation annotations.
+ """
+
+ if self.with_bbox:
+ results = self._load_bboxes(results)
+ if results is None:
+ return None
+ if self.with_label:
+ results = self._load_labels(results)
+ if self.with_mask:
+ results = self._load_masks(results)
+ if self.with_seg:
+ results = self._load_semantic_seg(results)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(with_bbox={self.with_bbox}, '
+ repr_str += f'with_label={self.with_label}, '
+ repr_str += f'with_mask={self.with_mask}, '
+ repr_str += f'with_seg={self.with_seg}, '
+ repr_str += f'poly2mask={self.poly2mask}, '
+        repr_str += f'file_client_args={self.file_client_args})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class LoadProposals:
+ """Load proposal pipeline.
+
+ Required key is "proposals". Updated keys are "proposals", "bbox_fields".
+
+ Args:
+ num_max_proposals (int, optional): Maximum number of proposals to load.
+ If not specified, all proposals will be loaded.
+ """
+
+ def __init__(self, num_max_proposals=None):
+ self.num_max_proposals = num_max_proposals
+
+ def __call__(self, results):
+ """Call function to load proposals from file.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded proposal annotations.
+ """
+
+ proposals = results['proposals']
+ if proposals.shape[1] not in (4, 5):
+ raise AssertionError(
+ 'proposals should have shapes (n, 4) or (n, 5), '
+ f'but found {proposals.shape}')
+ proposals = proposals[:, :4]
+
+ if self.num_max_proposals is not None:
+ proposals = proposals[:self.num_max_proposals]
+
+ if len(proposals) == 0:
+ proposals = np.array([[0, 0, 0, 0]], dtype=np.float32)
+ results['proposals'] = proposals
+ results['bbox_fields'].append('proposals')
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + \
+ f'(num_max_proposals={self.num_max_proposals})'
+
+
+@PIPELINES.register_module()
+class FilterAnnotations:
+ """Filter invalid annotations.
+
+ Args:
+ min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth
+ boxes.
+ """
+
+ def __init__(self, min_gt_bbox_wh):
+ # TODO: add more filter options
+ self.min_gt_bbox_wh = min_gt_bbox_wh
+
+ def __call__(self, results):
+ assert 'gt_bboxes' in results
+ gt_bboxes = results['gt_bboxes']
+ w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+ h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
+ keep = (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1])
+ if not keep.any():
+ return None
+ else:
+ keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg')
+ for key in keys:
+ if key in results:
+ results[key] = results[key][keep]
+ return results
+
+
+@PIPELINES.register_module()
+class LoadMultiViewImageFromFiles(object):
+ """Load multi channel images from a list of separate channel files.
+
+ Expects results['img_filename'] to be a list of filenames.
+
+ Args:
+ to_float32 (bool): Whether to convert the img to float32.
+ Defaults to False.
+ color_type (str): Color type of the file. Defaults to 'unchanged'.
+ """
+
+ def __init__(self, to_float32=False, color_type='unchanged'):
+ self.to_float32 = to_float32
+ self.color_type = color_type
+
+ def __call__(self, results):
+ """Call function to load multi-view image from files.
+
+ Args:
+ results (dict): Result dict containing multi-view image filenames.
+
+ Returns:
+ dict: The result dict containing the multi-view image data. \
+ Added keys and values are described below.
+
+ - filename (str): Multi-view image filenames.
+ - img (np.ndarray): Multi-view image arrays.
+ - img_shape (tuple[int]): Shape of multi-view image arrays.
+ - ori_shape (tuple[int]): Shape of original image arrays.
+ - pad_shape (tuple[int]): Shape of padded image arrays.
+ - scale_factor (float): Scale factor.
+ - img_norm_cfg (dict): Normalization configuration of images.
+ """
+ filename = results['img_filename']
+ # img is of shape (h, w, c, num_views)
+ img = np.stack(
+ [imread(name, self.color_type) for name in filename], axis=-1)
+ if self.to_float32:
+ img = img.astype(np.float32)
+ results['filename'] = filename
+ # unravel to list, see `DefaultFormatBundle` in formating.py
+ # which will transpose each image separately and then stack into array
+ results['img'] = [img[..., i] for i in range(img.shape[-1])]
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ # Set initial values for default meta_keys
+ results['pad_shape'] = img.shape
+ results['scale_factor'] = 1.0
+ num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+ results['img_norm_cfg'] = dict(
+ mean=np.zeros(num_channels, dtype=np.float32),
+ std=np.ones(num_channels, dtype=np.float32),
+ to_rgb=False)
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(to_float32={self.to_float32}, '
+ repr_str += f"color_type='{self.color_type}')"
+ return repr_str
+
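+# Shape sketch for the multi-view loader above: with 6 camera images of size
+# (H, W, 3), the stacked array is (H, W, 3, 6), ``results['img']`` becomes a
+# list of 6 arrays of shape (H, W, 3), and ``img_shape`` is (H, W, 3, 6).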
+
+@PIPELINES.register_module()
+class LoadImageFromFileMono3D(LoadImageFromFile):
+ """Load an image from file in monocular 3D object detection. Compared to 2D
+ detection, additional camera parameters need to be loaded.
+
+ Args:
+ kwargs (dict): Arguments are the same as those in \
+ :class:`LoadImageFromFile`.
+ """
+
+ def __call__(self, results):
+ """Call functions to load image and get image meta information.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv.CustomDataset`.
+
+ Returns:
+ dict: The dict contains loaded image and meta information.
+ """
+ super().__call__(results)
+ results['cam2img'] = results['img_info']['cam_intrinsic']
+ return results
+
+
+@PIPELINES.register_module()
+class LoadPointsFromMultiSweeps(object):
+ """Load points from multiple sweeps.
+
+ This is usually used for nuScenes dataset to utilize previous sweeps.
+
+ Args:
+ sweeps_num (int): Number of sweeps. Defaults to 10.
+ load_dim (int): Dimension number of the loaded points. Defaults to 5.
+ use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
+ file_client_args (dict): Config dict of file clients, refer to
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
+ for more details. Defaults to dict(backend='disk').
+ pad_empty_sweeps (bool): Whether to repeat keyframe when
+ sweeps is empty. Defaults to False.
+ remove_close (bool): Whether to remove close points.
+ Defaults to False.
+        test_mode (bool): If ``test_mode=True`` (used for testing), sweeps are
+            not randomly sampled; the nearest ``sweeps_num`` frames are
+            selected instead.
+ Defaults to False.
+ """
+
+ def __init__(self,
+ sweeps_num=10,
+ load_dim=5,
+ use_dim=[0, 1, 2, 4],
+ file_client_args=dict(backend='disk'),
+ pad_empty_sweeps=False,
+ remove_close=False,
+ test_mode=False):
+ self.load_dim = load_dim
+ self.sweeps_num = sweeps_num
+ self.use_dim = use_dim
+ self.file_client_args = file_client_args.copy()
+ self.file_client = None
+ self.pad_empty_sweeps = pad_empty_sweeps
+ self.remove_close = remove_close
+ self.test_mode = test_mode
+
+ def _load_points(self, pts_filename):
+ """Private function to load point clouds data.
+
+ Args:
+ pts_filename (str): Filename of point clouds data.
+
+ Returns:
+ np.ndarray: An array containing point clouds data.
+ """
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+ try:
+ pts_bytes = self.file_client.get(pts_filename)
+ points = np.frombuffer(pts_bytes, dtype=np.float32)
+ except ConnectionError:
+ check_file_exist(pts_filename)
+ if pts_filename.endswith('.npy'):
+ points = np.load(pts_filename)
+ else:
+ points = np.fromfile(pts_filename, dtype=np.float32)
+ return points
+
+ def _remove_close(self, points, radius=1.0):
+ """Removes point too close within a certain radius from origin.
+
+ Args:
+ points (np.ndarray | :obj:`BasePoints`): Sweep points.
+ radius (float): Radius below which points are removed.
+ Defaults to 1.0.
+
+ Returns:
+ np.ndarray: Points after removing.
+ """
+ if isinstance(points, np.ndarray):
+ points_numpy = points
+ elif isinstance(points, BasePoints):
+ points_numpy = points.tensor.numpy()
+ else:
+ raise NotImplementedError
+ x_filt = np.abs(points_numpy[:, 0]) < radius
+ y_filt = np.abs(points_numpy[:, 1]) < radius
+ not_close = np.logical_not(np.logical_and(x_filt, y_filt))
+ return points[not_close]
+
+ def __call__(self, results):
+ """Call function to load multi-sweep point clouds from files.
+
+ Args:
+ results (dict): Result dict containing multi-sweep point cloud \
+ filenames.
+
+ Returns:
+ dict: The result dict containing the multi-sweep points data. \
+ Added key and value are described below.
+
+ - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \
+ cloud arrays.
+ """
+ points = results['points']
+ points.tensor[:, 4] = 0
+ sweep_points_list = [points]
+ ts = results['timestamp']
+ if self.pad_empty_sweeps and len(results['sweeps']) == 0:
+ for i in range(self.sweeps_num):
+ if self.remove_close:
+ sweep_points_list.append(self._remove_close(points))
+ else:
+ sweep_points_list.append(points)
+ else:
+ if len(results['sweeps']) <= self.sweeps_num:
+ choices = np.arange(len(results['sweeps']))
+ elif self.test_mode:
+ choices = np.arange(self.sweeps_num)
+ else:
+ choices = np.random.choice(
+ len(results['sweeps']), self.sweeps_num, replace=False)
+ for idx in choices:
+ sweep = results['sweeps'][idx]
+ points_sweep = self._load_points(sweep['data_path'])
+ points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim)
+ if self.remove_close:
+ points_sweep = self._remove_close(points_sweep)
+ sweep_ts = sweep['timestamp'] / 1e6
+ points_sweep[:, :3] = points_sweep[:, :3] @ sweep[
+ 'sensor2lidar_rotation'].T
+ points_sweep[:, :3] += sweep['sensor2lidar_translation']
+ points_sweep[:, 4] = ts - sweep_ts
+ points_sweep = points.new_point(points_sweep)
+ sweep_points_list.append(points_sweep)
+
+ points = points.cat(sweep_points_list)
+ points = points[:, self.use_dim]
+ results['points'] = points
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})'
+
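+# Hypothetical nuScenes-style config entry for the sweep loader above (values
+# are illustrative, not taken from this repo's configs):
+#
+#   dict(type='LoadPointsFromMultiSweeps', sweeps_num=10,
+#        use_dim=[0, 1, 2, 3, 4], pad_empty_sweeps=True, remove_close=True)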
+
+@PIPELINES.register_module()
+class PointSegClassMapping(object):
+ """Map original semantic class to valid category ids.
+
+ Map valid classes as 0~len(valid_cat_ids)-1 and
+ others as len(valid_cat_ids).
+
+ Args:
+ valid_cat_ids (tuple[int]): A tuple of valid category.
+ max_cat_id (int): The max possible cat_id in input segmentation mask.
+ Defaults to 40.
+ """
+
+ def __init__(self, valid_cat_ids, max_cat_id=40):
+ assert max_cat_id >= np.max(valid_cat_ids), \
+            'max_cat_id should be no less than the maximum id in valid_cat_ids'
+
+ self.valid_cat_ids = valid_cat_ids
+ self.max_cat_id = int(max_cat_id)
+
+ # build cat_id to class index mapping
+ neg_cls = len(valid_cat_ids)
+ self.cat_id2class = np.ones(
+            self.max_cat_id + 1, dtype=np.int64) * neg_cls
+ for cls_idx, cat_id in enumerate(valid_cat_ids):
+ self.cat_id2class[cat_id] = cls_idx
+
+ def __call__(self, results):
+ """Call function to map original semantic class to valid category ids.
+
+ Args:
+ results (dict): Result dict containing point semantic masks.
+
+ Returns:
+ dict: The result dict containing the mapped category ids. \
+ Updated key and value are described below.
+
+ - pts_semantic_mask (np.ndarray): Mapped semantic masks.
+ """
+ assert 'pts_semantic_mask' in results
+ pts_semantic_mask = results['pts_semantic_mask']
+
+ converted_pts_sem_mask = self.cat_id2class[pts_semantic_mask]
+
+ results['pts_semantic_mask'] = converted_pts_sem_mask
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(valid_cat_ids={self.valid_cat_ids}, '
+ repr_str += f'max_cat_id={self.max_cat_id})'
+ return repr_str
+
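+# Worked example with made-up ids: valid_cat_ids=(3, 5), max_cat_id=6 builds
+# cat_id2class = [2, 2, 2, 0, 2, 1, 2], so a mask [5, 3, 0] maps to [1, 0, 2].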
+
+@PIPELINES.register_module()
+class NormalizePointsColor(object):
+ """Normalize color of points.
+
+ Args:
+ color_mean (list[float]): Mean color of the point cloud.
+ """
+
+ def __init__(self, color_mean):
+ self.color_mean = color_mean
+
+ def __call__(self, results):
+ """Call function to normalize color of points.
+
+ Args:
+ results (dict): Result dict containing point clouds data.
+
+ Returns:
+ dict: The result dict containing the normalized points. \
+ Updated key and value are described below.
+
+ - points (:obj:`BasePoints`): Points after color normalization.
+ """
+ points = results['points']
+ assert points.attribute_dims is not None and \
+ 'color' in points.attribute_dims.keys(), \
+ 'Expect points have color attribute'
+ if self.color_mean is not None:
+ points.color = points.color - \
+ points.color.new_tensor(self.color_mean)
+ points.color = points.color / 255.0
+ results['points'] = points
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(color_mean={self.color_mean})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class LoadPointsFromFile(object):
+ """Load Points From File.
+
+    Load SUN RGB-D and ScanNet points from file.
+
+ Args:
+ coord_type (str): The type of coordinates of points cloud.
+ Available options includes:
+ - 'LIDAR': Points in LiDAR coordinates.
+ - 'DEPTH': Points in depth coordinates, usually for indoor dataset.
+ - 'CAMERA': Points in camera coordinates.
+ load_dim (int): The dimension of the loaded points.
+ Defaults to 6.
+ use_dim (list[int]): Which dimensions of the points to be used.
+ Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
+ or use_dim=[0, 1, 2, 3] to use the intensity dimension.
+ shift_height (bool): Whether to use shifted height. Defaults to False.
+ use_color (bool): Whether to use color features. Defaults to False.
+ file_client_args (dict): Config dict of file clients, refer to
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
+ for more details. Defaults to dict(backend='disk').
+ """
+
+ def __init__(self,
+ coord_type,
+ load_dim=6,
+ use_dim=[0, 1, 2],
+ shift_height=False,
+ use_color=False,
+ file_client_args=dict(backend='disk')):
+ self.shift_height = shift_height
+ self.use_color = use_color
+ if isinstance(use_dim, int):
+ use_dim = list(range(use_dim))
+ assert max(use_dim) < load_dim, \
+ f'Expect all used dimensions < {load_dim}, got {use_dim}'
+ assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH']
+
+ self.coord_type = coord_type
+ self.load_dim = load_dim
+ self.use_dim = use_dim
+ self.file_client_args = file_client_args.copy()
+ self.file_client = None
+
+ def _load_points(self, pts_filename):
+ """Private function to load point clouds data.
+
+ Args:
+ pts_filename (str): Filename of point clouds data.
+
+ Returns:
+ np.ndarray: An array containing point clouds data.
+ """
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+ try:
+ pts_bytes = self.file_client.get(pts_filename)
+ points = np.frombuffer(pts_bytes, dtype=np.float32)
+ except ConnectionError:
+ check_file_exist(pts_filename)
+ if pts_filename.endswith('.npy'):
+ points = np.load(pts_filename)
+ else:
+ points = np.fromfile(pts_filename, dtype=np.float32)
+
+ return points
+
+ def __call__(self, results):
+ """Call function to load points data from file.
+
+ Args:
+ results (dict): Result dict containing point clouds data.
+
+ Returns:
+ dict: The result dict containing the point clouds data. \
+ Added key and value are described below.
+
+ - points (:obj:`BasePoints`): Point clouds data.
+ """
+ pts_filename = results['pts_filename']
+ points = self._load_points(pts_filename)
+ points = points.reshape(-1, self.load_dim)
+ points = points[:, self.use_dim]
+ attribute_dims = None
+
+ if self.shift_height:
+ floor_height = np.percentile(points[:, 2], 0.99)
+ height = points[:, 2] - floor_height
+ points = np.concatenate(
+ [points[:, :3],
+ np.expand_dims(height, 1), points[:, 3:]], 1)
+ attribute_dims = dict(height=3)
+
+ if self.use_color:
+ assert len(self.use_dim) >= 6
+ if attribute_dims is None:
+ attribute_dims = dict()
+ attribute_dims.update(
+ dict(color=[
+ points.shape[1] - 3,
+ points.shape[1] - 2,
+ points.shape[1] - 1,
+ ]))
+
+ points_class = get_points_type(self.coord_type)
+ points = points_class(
+ points, points_dim=points.shape[-1], attribute_dims=attribute_dims)
+ results['points'] = points
+
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__ + '('
+ repr_str += f'shift_height={self.shift_height}, '
+ repr_str += f'use_color={self.use_color}, '
+ repr_str += f'file_client_args={self.file_client_args}, '
+ repr_str += f'load_dim={self.load_dim}, '
+ repr_str += f'use_dim={self.use_dim})'
+ return repr_str
+
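+# Hypothetical KITTI-style entry for the point loader above (dims follow its
+# docstring, values are illustrative):
+#
+#   dict(type='LoadPointsFromFile', coord_type='LIDAR',
+#        load_dim=4, use_dim=[0, 1, 2, 3])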
+
+@PIPELINES.register_module()
+class LoadAnnotations3D(LoadAnnotations):
+ """Load Annotations3D.
+
+ Load instance mask and semantic mask of points and
+ encapsulate the items into related fields.
+
+ Args:
+ with_bbox_3d (bool, optional): Whether to load 3D boxes.
+ Defaults to True.
+ with_label_3d (bool, optional): Whether to load 3D labels.
+ Defaults to True.
+ with_attr_label (bool, optional): Whether to load attribute label.
+ Defaults to False.
+        with_mask_3d (bool, optional): Whether to load 3D instance masks
+            for points. Defaults to False.
+        with_seg_3d (bool, optional): Whether to load 3D semantic masks
+            for points. Defaults to False.
+ with_bbox (bool, optional): Whether to load 2D boxes.
+ Defaults to False.
+ with_label (bool, optional): Whether to load 2D labels.
+ Defaults to False.
+ with_mask (bool, optional): Whether to load 2D instance masks.
+ Defaults to False.
+ with_seg (bool, optional): Whether to load 2D semantic masks.
+ Defaults to False.
+ with_bbox_depth (bool, optional): Whether to load 2.5D boxes.
+ Defaults to False.
+ poly2mask (bool, optional): Whether to convert polygon annotations
+ to bitmasks. Defaults to True.
+ seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
+            Defaults to ``'int'``.
+ file_client_args (dict): Config dict of file clients, refer to
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
+ for more details.
+ """
+
+ def __init__(self,
+ with_bbox_3d=True,
+ with_label_3d=True,
+ with_attr_label=False,
+ with_mask_3d=False,
+ with_seg_3d=False,
+ with_bbox=False,
+ with_label=False,
+ with_mask=False,
+ with_seg=False,
+ with_bbox_depth=False,
+ poly2mask=True,
+ seg_3d_dtype='int',
+ file_client_args=dict(backend='disk')):
+ super().__init__(
+ with_bbox,
+ with_label,
+ with_mask,
+ with_seg,
+ poly2mask,
+ file_client_args=file_client_args)
+ self.with_bbox_3d = with_bbox_3d
+ self.with_bbox_depth = with_bbox_depth
+ self.with_label_3d = with_label_3d
+ self.with_attr_label = with_attr_label
+ self.with_mask_3d = with_mask_3d
+ self.with_seg_3d = with_seg_3d
+ self.seg_3d_dtype = seg_3d_dtype
+
+ def _load_bboxes_3d(self, results):
+ """Private function to load 3D bounding box annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded 3D bounding box annotations.
+ """
+ results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d']
+ results['bbox3d_fields'].append('gt_bboxes_3d')
+ return results
+
+ def _load_bboxes_depth(self, results):
+ """Private function to load 2.5D bounding box annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded 2.5D bounding box annotations.
+ """
+ results['centers2d'] = results['ann_info']['centers2d']
+ results['depths'] = results['ann_info']['depths']
+ return results
+
+ def _load_labels_3d(self, results):
+ """Private function to load label annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded label annotations.
+ """
+ results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
+ return results
+
+ def _load_attr_labels(self, results):
+ """Private function to load label annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded label annotations.
+ """
+ results['attr_labels'] = results['ann_info']['attr_labels']
+ return results
+
+ def _load_masks_3d(self, results):
+ """Private function to load 3D mask annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded 3D mask annotations.
+ """
+ pts_instance_mask_path = results['ann_info']['pts_instance_mask_path']
+
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+ try:
+ mask_bytes = self.file_client.get(pts_instance_mask_path)
+            pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64)
+ except ConnectionError:
+ check_file_exist(pts_instance_mask_path)
+ pts_instance_mask = np.fromfile(
+                pts_instance_mask_path, dtype=np.int64)
+
+ results['pts_instance_mask'] = pts_instance_mask
+ results['pts_mask_fields'].append('pts_instance_mask')
+ return results
+
+ def _load_semantic_seg_3d(self, results):
+ """Private function to load 3D semantic segmentation annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing the semantic segmentation annotations.
+ """
+ pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path']
+
+ if self.file_client is None:
+ self.file_client = FileClient(**self.file_client_args)
+ try:
+ mask_bytes = self.file_client.get(pts_semantic_mask_path)
+ # add .copy() to fix read-only bug
+ pts_semantic_mask = np.frombuffer(
+ mask_bytes, dtype=self.seg_3d_dtype).copy()
+ except ConnectionError:
+ check_file_exist(pts_semantic_mask_path)
+ pts_semantic_mask = np.fromfile(
+                pts_semantic_mask_path, dtype=np.int64)
+
+ results['pts_semantic_mask'] = pts_semantic_mask
+ results['pts_seg_fields'].append('pts_semantic_mask')
+ return results
+
+ def __call__(self, results):
+ """Call function to load multiple types annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded 3D bounding box, label, mask and
+ semantic segmentation annotations.
+ """
+ results = super().__call__(results)
+ if self.with_bbox_3d:
+ results = self._load_bboxes_3d(results)
+ if results is None:
+ return None
+ if self.with_bbox_depth:
+ results = self._load_bboxes_depth(results)
+ if results is None:
+ return None
+ if self.with_label_3d:
+ results = self._load_labels_3d(results)
+ if self.with_attr_label:
+ results = self._load_attr_labels(results)
+ if self.with_mask_3d:
+ results = self._load_masks_3d(results)
+ if self.with_seg_3d:
+ results = self._load_semantic_seg_3d(results)
+
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ indent_str = ' '
+ repr_str = self.__class__.__name__ + '(\n'
+ repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
+ repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
+ repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, '
+ repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
+ repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
+ repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
+ repr_str += f'{indent_str}with_label={self.with_label}, '
+ repr_str += f'{indent_str}with_mask={self.with_mask}, '
+ repr_str += f'{indent_str}with_seg={self.with_seg}, '
+ repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, '
+ repr_str += f'{indent_str}poly2mask={self.poly2mask})'
+ return repr_str
+
+@PIPELINES.register_module()
+class LoadMultiViewImageFromFilesInCeph(object):
+ """Load multi channel images from a list of separate channel files.
+
+ Expects results['img_filename'] to be a list of filenames.
+
+ Args:
+ to_float32 (bool): Whether to convert the img to float32.
+ Defaults to False.
+ color_type (str): Color type of the file. Defaults to 'unchanged'.
+ """
+
+ def __init__(self, to_float32=False, color_type='unchanged', file_client_args=dict(backend='disk'), img_root=''):
+ self.to_float32 = to_float32
+ self.color_type = color_type
+ self.file_client_args = file_client_args.copy()
+ self.file_client = FileClient(**self.file_client_args)
+ self.img_root = img_root
+
+ def __call__(self, results):
+ """Call function to load multi-view image from files.
+
+ Args:
+ results (dict): Result dict containing multi-view image filenames.
+
+ Returns:
+ dict: The result dict containing the multi-view image data. \
+ Added keys and values are described below.
+
+ - filename (list of str): Multi-view image filenames.
+ - img (np.ndarray): Multi-view image arrays.
+ - img_shape (tuple[int]): Shape of multi-view image arrays.
+ - ori_shape (tuple[int]): Shape of original image arrays.
+ - pad_shape (tuple[int]): Shape of padded image arrays.
+ - scale_factor (float): Scale factor.
+ - img_norm_cfg (dict): Normalization configuration of images.
+ """
+ images_multiView = []
+ filename = results['img_filename']
+ for img_path in filename:
+ # img_path = os.path.join(self.img_root, img_path)
+ if self.file_client_args['backend'] == 'petrel':
+ img_bytes = self.file_client.get(img_path)
+ img = imfrombytes(img_bytes)
+ elif self.file_client_args['backend'] == 'disk':
+ img = imread(img_path, self.color_type)
+ images_multiView.append(img)
+ # img is of shape (h, w, c, num_views)
+ img = np.stack(
+ #[mmcv.imread(name, self.color_type) for name in filename], axis=-1)
+ images_multiView, axis=-1)
+ if self.to_float32:
+ img = img.astype(np.float32)
+ results['filename'] = filename
+ # unravel to list, see `DefaultFormatBundle` in formating.py
+ # which will transpose each image separately and then stack into array
+ results['img'] = [img[..., i] for i in range(img.shape[-1])]
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ # Set initial values for default meta_keys
+ results['pad_shape'] = img.shape
+ results['scale_factor'] = 1.0
+ num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+ results['img_norm_cfg'] = dict(
+ mean=np.zeros(num_channels, dtype=np.float32),
+ std=np.ones(num_channels, dtype=np.float32),
+ to_rgb=False)
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(to_float32={self.to_float32}, '
+ repr_str += f"color_type='{self.color_type}')"
+ return repr_str
+
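+# Illustrative pipeline entry for the loader above (a sketch, not taken from a
+# shipped config; the argument values are placeholders):
+#   dict(type='LoadMultiViewImageFromFilesInCeph',
+#        to_float32=True,
+#        color_type='unchanged',
+#        file_client_args=dict(backend='disk'))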
+
+@PIPELINES.register_module()
+class LoadAnnotations3D_E2E(LoadAnnotations3D):
+ """Load Annotations3D.
+
+ Load instance mask and semantic mask of points and
+ encapsulate the items into related fields.
+
+ Args:
+ with_bbox_3d (bool, optional): Whether to load 3D boxes.
+ Defaults to True.
+ with_label_3d (bool, optional): Whether to load 3D labels.
+ Defaults to True.
+ with_attr_label (bool, optional): Whether to load attribute label.
+ Defaults to False.
+ with_mask_3d (bool, optional): Whether to load 3D instance masks
+ for points. Defaults to False.
+ with_seg_3d (bool, optional): Whether to load 3D semantic masks
+ for points. Defaults to False.
+ with_bbox (bool, optional): Whether to load 2D boxes.
+ Defaults to False.
+ with_label (bool, optional): Whether to load 2D labels.
+ Defaults to False.
+ with_mask (bool, optional): Whether to load 2D instance masks.
+ Defaults to False.
+ with_seg (bool, optional): Whether to load 2D semantic masks.
+ Defaults to False.
+ with_bbox_depth (bool, optional): Whether to load 2.5D boxes.
+ Defaults to False.
+ poly2mask (bool, optional): Whether to convert polygon annotations
+ to bitmasks. Defaults to True.
+ seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
+ Defaults to int64
+ file_client_args (dict): Config dict of file clients, refer to
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
+ for more details.
+ """
+ def __init__(self,
+ with_future_anns=False,
+ with_ins_inds_3d=False,
+ with_vis_token=True,
+ ins_inds_add_1=False, # NOTE: make ins_inds start from 1, not 0
+ **kwargs):
+ super().__init__(**kwargs)
+ self.with_future_anns = with_future_anns
+ self.with_ins_inds_3d = with_ins_inds_3d
+ self.with_vis_token = with_vis_token
+ self.ins_inds_add_1 = ins_inds_add_1
+
+ def _load_future_anns(self, results):
+ """Private function to load 3D bounding box annotations.
+
+ Args:
+ results (dict): Result dict from :obj:`mmcv3d.CustomDataset`.
+
+ Returns:
+ dict: The dict containing loaded 3D bounding box annotations.
+ """
+
+ gt_bboxes_3d = []
+ gt_labels_3d = []
+ gt_inds_3d = []
+ # gt_valid_flags = []
+ gt_vis_tokens = []
+
+ for ann_info in results['occ_future_ann_infos']:
+ if ann_info is not None:
+ gt_bboxes_3d.append(ann_info['gt_bboxes_3d'])
+ gt_labels_3d.append(ann_info['gt_labels_3d'])
+
+ ann_gt_inds = ann_info['gt_inds']
+ if self.ins_inds_add_1:
+ ann_gt_inds += 1
+ # NOTE: sdc query is changed from -10 -> -9
+ gt_inds_3d.append(ann_gt_inds)
+
+ # gt_valid_flags.append(ann_info['gt_valid_flag'])
+ if self.with_vis_token:
+ gt_vis_tokens.append(ann_info['gt_vis_tokens'])
+ else:
+ # invalid frame
+ gt_bboxes_3d.append(None)
+ gt_labels_3d.append(None)
+ gt_inds_3d.append(None)
+ # gt_valid_flags.append(None)
+ if self.with_vis_token:
+ gt_vis_tokens.append(None)
+
+ results['future_gt_bboxes_3d'] = gt_bboxes_3d
+ # results['future_bbox3d_fields'].append('gt_bboxes_3d') # Field is used for augmentations, not needed here
+ results['future_gt_labels_3d'] = gt_labels_3d
+ results['future_gt_inds'] = gt_inds_3d
+ # results['future_gt_valid_flag'] = gt_valid_flags
+ if self.with_vis_token:
+ results['future_gt_vis_tokens'] = gt_vis_tokens
+
+ return results
+
+ def _load_ins_inds_3d(self, results):
+ ann_gt_inds = results['ann_info']['gt_inds'].copy() # TODO: note here
+
+ # NOTE: Avoid gt_inds generated twice
+ results['ann_info'].pop('gt_inds')
+
+ if self.ins_inds_add_1:
+ ann_gt_inds += 1
+ results['gt_inds'] = ann_gt_inds
+ return results
+
+ def __call__(self, results):
+ results = super().__call__(results)
+
+ if self.with_future_anns:
+ results = self._load_future_anns(results)
+ if self.with_ins_inds_3d:
+ results = self._load_ins_inds_3d(results)
+
+ # Generate ann for plan
+ if 'occ_future_ann_infos_for_plan' in results.keys():
+ results = self._load_future_anns_plan(results)
+
+ return results
+
+ def __repr__(self):
+ repr_str = super().__repr__()
+ indent_str = ' '
+ repr_str += f'{indent_str}with_future_anns={self.with_future_anns}, '
+ repr_str += f'{indent_str}with_ins_inds_3d={self.with_ins_inds_3d}, '
+
+ return repr_str
+
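+# Illustrative pipeline entry for LoadAnnotations3D_E2E (a sketch; the flag
+# values below are example choices, not a specific shipped config):
+#   dict(type='LoadAnnotations3D_E2E',
+#        with_bbox_3d=True, with_label_3d=True, with_attr_label=False,
+#        with_future_anns=True,    # reads results['occ_future_ann_infos']
+#        with_ins_inds_3d=True,    # reads results['ann_info']['gt_inds']
+#        ins_inds_add_1=True)      # shift instance ids to start from 1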
+
+def load_augmented_point_cloud(path, virtual=False, reduce_beams=32):
+ # NOTE: following Tianwei's implementation, it is hard coded for nuScenes
+ points = np.fromfile(path, dtype=np.float32).reshape(-1, 5)
+ # NOTE: path definition different from Tianwei's implementation.
+ tokens = path.split("/")
+ vp_dir = "_VIRTUAL" if reduce_beams == 32 else f"_VIRTUAL_{reduce_beams}BEAMS"
+ seg_path = os.path.join(
+ *tokens[:-3],
+ "virtual_points",
+ tokens[-3],
+ tokens[-2] + vp_dir,
+ tokens[-1] + ".pkl.npy",
+ )
+ assert os.path.exists(seg_path)
+ data_dict = np.load(seg_path, allow_pickle=True).item()
+
+ virtual_points1 = data_dict["real_points"]
+ # NOTE: add zero reflectance to virtual points instead of removing them from real points
+ virtual_points2 = np.concatenate(
+ [
+ data_dict["virtual_points"][:, :3],
+ np.zeros([data_dict["virtual_points"].shape[0], 1]),
+ data_dict["virtual_points"][:, 3:],
+ ],
+ axis=-1,
+ )
+
+ points = np.concatenate(
+ [
+ points,
+ np.ones([points.shape[0], virtual_points1.shape[1] - points.shape[1] + 1]),
+ ],
+ axis=1,
+ )
+ virtual_points1 = np.concatenate(
+ [virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1
+ )
+ # note: this part is different from Tianwei's implementation, we don't have duplicate foreground real points.
+ if len(data_dict["real_points_indice"]) > 0:
+ points[data_dict["real_points_indice"]] = virtual_points1
+ if virtual:
+ virtual_points2 = np.concatenate(
+ [virtual_points2, -1 * np.ones([virtual_points2.shape[0], 1])], axis=1
+ )
+ points = np.concatenate([points, virtual_points2], axis=0).astype(np.float32)
+ return points
+
+
+def reduce_LiDAR_beams(pts, reduce_beams_to=32):
+ # print(pts.size())
+ if isinstance(pts, np.ndarray):
+ pts = torch.from_numpy(pts)
+ radius = torch.sqrt(pts[:, 0].pow(2) + pts[:, 1].pow(2) + pts[:, 2].pow(2))
+ sine_theta = pts[:, 2] / radius
+ # [-pi/2, pi/2]
+ theta = torch.asin(sine_theta)
+ phi = torch.atan2(pts[:, 1], pts[:, 0])
+
+ top_ang = 0.1862
+ down_ang = -0.5353
+
+ beam_range = torch.zeros(32)
+ beam_range[0] = top_ang
+ beam_range[31] = down_ang
+
+ for i in range(1, 31):
+ beam_range[i] = beam_range[i - 1] - 0.023275
+ # beam_range = [1, 0.18, 0.15, 0.13, 0.11, 0.085, 0.065, 0.03, 0.01, -0.01, -0.03, -0.055, -0.08, -0.105, -0.13, -0.155, -0.18, -0.205, -0.228, -0.251, -0.275,
+ # -0.295, -0.32, -0.34, -0.36, -0.38, -0.40, -0.425, -0.45, -0.47, -0.49, -0.52, -0.54]
+
+ num_pts, _ = pts.size()
+ mask = torch.zeros(num_pts)
+ if reduce_beams_to == 16:
+ for id in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]:
+ beam_mask = (theta < (beam_range[id - 1] - 0.012)) * (
+ theta > (beam_range[id] - 0.012)
+ )
+ mask = mask + beam_mask
+ mask = mask.bool()
+ elif reduce_beams_to == 4:
+ for id in [7, 9, 11, 13]:
+ beam_mask = (theta < (beam_range[id - 1] - 0.012)) * (
+ theta > (beam_range[id] - 0.012)
+ )
+ mask = mask + beam_mask
+ mask = mask.bool()
+ # [?] pick the 14th beam
+ elif reduce_beams_to == 1:
+ chosen_beam_id = 9
+ mask = (theta < (beam_range[chosen_beam_id - 1] - 0.012)) * (
+ theta > (beam_range[chosen_beam_id] - 0.012)
+ )
+ else:
+ raise NotImplementedError
+ # points = copy.copy(pts)
+ points = pts[mask]
+ # print(points.size())
+ return points.numpy()
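+
+# Quick sketch of how the beam-reduction helper above can be invoked
+# (illustrative only; the .bin path is a placeholder):
+#   pts = np.fromfile('sample_lidar.bin', dtype=np.float32).reshape(-1, 5)
+#   pts_16 = reduce_LiDAR_beams(pts, reduce_beams_to=16)  # keeps ~every other beam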
+
+@PIPELINES.register_module()
+class CustomLoadPointsFromMultiSweeps:
+ """Load points from multiple sweeps.
+
+ This is usually used for nuScenes dataset to utilize previous sweeps.
+
+ Args:
+ sweeps_num (int): Number of sweeps. Defaults to 10.
+ load_dim (int): Dimension number of the loaded points. Defaults to 5.
+ use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
+ pad_empty_sweeps (bool): Whether to repeat keyframe when
+ sweeps is empty. Defaults to False.
+ remove_close (bool): Whether to remove close points.
+ Defaults to False.
+ test_mode (bool): If True (i.e. used for testing), sweeps are not
+ randomly sampled; the nearest N frames are selected instead.
+ Defaults to False.
+ """
+
+ def __init__(
+ self,
+ sweeps_num=10,
+ load_dim=5,
+ use_dim=[0, 1, 2, 4],
+ pad_empty_sweeps=False,
+ remove_close=False,
+ test_mode=False,
+ load_augmented=None,
+ reduce_beams=None,
+ ):
+ self.load_dim = load_dim
+ self.sweeps_num = sweeps_num
+ if isinstance(use_dim, int):
+ use_dim = list(range(use_dim))
+ self.use_dim = use_dim
+ self.pad_empty_sweeps = pad_empty_sweeps
+ self.remove_close = remove_close
+ self.test_mode = test_mode
+ self.load_augmented = load_augmented
+ self.reduce_beams = reduce_beams
+
+ def _load_points(self, lidar_path):
+ """Private function to load point clouds data.
+
+ Args:
+ lidar_path (str): Filename of point clouds data.
+
+ Returns:
+ np.ndarray: An array containing point clouds data.
+ """
+ mmcv.check_file_exist(lidar_path)
+ if self.load_augmented:
+ assert self.load_augmented in ["pointpainting", "mvp"]
+ virtual = self.load_augmented == "mvp"
+ points = load_augmented_point_cloud(
+ lidar_path, virtual=virtual, reduce_beams=self.reduce_beams
+ )
+ elif lidar_path.endswith(".npy"):
+ points = np.load(lidar_path)
+ else:
+ points = np.fromfile(lidar_path, dtype=np.float32)
+ return points
+
+ def _remove_close(self, points, radius=1.0):
+ """Remove points that lie within ``radius`` of the origin.
+
+ Args:
+ points (np.ndarray | :obj:`BasePoints`): Sweep points.
+ radius (float): Radius below which points are removed.
+ Defaults to 1.0.
+
+ Returns:
+ np.ndarray: Points after removing.
+ """
+ if isinstance(points, np.ndarray):
+ points_numpy = points
+ elif isinstance(points, BasePoints):
+ points_numpy = points.tensor.numpy()
+ else:
+ raise NotImplementedError
+ x_filt = np.abs(points_numpy[:, 0]) < radius
+ y_filt = np.abs(points_numpy[:, 1]) < radius
+ not_close = np.logical_not(np.logical_and(x_filt, y_filt))
+ return points[not_close]
+
+ def __call__(self, results):
+ """Call function to load multi-sweep point clouds from files.
+
+ Args:
+ results (dict): Result dict containing multi-sweep point cloud \
+ filenames.
+
+ Returns:
+ dict: The result dict containing the multi-sweep points data. \
+ Added key and value are described below.
+
+ - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \
+ cloud arrays.
+ """
+ points = results["points"]
+ points.tensor[:, 4] = 0
+ sweep_points_list = [points]
+ ts = results["timestamp"] / 1e6
+ if self.pad_empty_sweeps and len(results["sweeps"]) == 0:
+ for i in range(self.sweeps_num):
+ if self.remove_close:
+ sweep_points_list.append(self._remove_close(points))
+ else:
+ sweep_points_list.append(points)
+ else:
+ if len(results["sweeps"]) <= self.sweeps_num:
+ choices = np.arange(len(results["sweeps"]))
+ elif self.test_mode:
+ choices = np.arange(self.sweeps_num)
+ else:
+ # NOTE: seems possible to load frame -11?
+ if not self.load_augmented:
+ choices = np.random.choice(
+ len(results["sweeps"]), self.sweeps_num, replace=False
+ )
+ else:
+ # do not sample the earliest frame, to match Tianwei's implementation.
+ choices = np.random.choice(
+ len(results["sweeps"]) - 1, self.sweeps_num, replace=False
+ )
+ for idx in choices:
+ sweep = results["sweeps"][idx]
+ points_sweep = self._load_points(sweep["data_path"])
+ points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim)
+
+ # TODO: make it more general
+ if self.reduce_beams and self.reduce_beams < 32:
+ points_sweep = reduce_LiDAR_beams(points_sweep, self.reduce_beams)
+
+ if self.remove_close:
+ points_sweep = self._remove_close(points_sweep)
+ sweep_ts = sweep["timestamp"] / 1e6
+ points_sweep[:, :3] = (
+ points_sweep[:, :3] @ sweep["sensor2lidar_rotation"].T
+ )
+ points_sweep[:, :3] += sweep["sensor2lidar_translation"]
+ points_sweep[:, 4] = ts - sweep_ts
+ points_sweep = points.new_point(points_sweep)
+ sweep_points_list.append(points_sweep)
+
+ points = points.cat(sweep_points_list)
+ points = points[:, self.use_dim]
+ results["points"] = points
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ return f"{self.__class__.__name__}(sweeps_num={self.sweeps_num})"
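+
+# Illustrative pipeline entry for the sweep loader above (a sketch; the values
+# follow the docstring defaults rather than a specific config):
+#   dict(type='CustomLoadPointsFromMultiSweeps',
+#        sweeps_num=10, load_dim=5, use_dim=[0, 1, 2, 4],
+#        pad_empty_sweeps=True, remove_close=True)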
+
+
+
+@PIPELINES.register_module()
+class CustomLoadPointsFromFile:
+ """Load Points From File.
+
+ Load point cloud data from file.
+
+ Args:
+ coord_type (str): The type of coordinates of points cloud.
+ Available options includes:
+ - 'LIDAR': Points in LiDAR coordinates.
+ - 'DEPTH': Points in depth coordinates, usually for indoor dataset.
+ - 'CAMERA': Points in camera coordinates.
+ load_dim (int): The dimension of the loaded points.
+ Defaults to 6.
+ use_dim (list[int]): Which dimensions of the points to use.
+ Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
+ or use_dim=[0, 1, 2, 3] to use the intensity dimension.
+ shift_height (bool): Whether to use shifted height. Defaults to False.
+ use_color (bool): Whether to use color features. Defaults to False.
+ """
+
+ def __init__(
+ self,
+ coord_type,
+ load_dim=6,
+ use_dim=[0, 1, 2],
+ shift_height=False,
+ use_color=False,
+ load_augmented=None,
+ reduce_beams=None,
+ ):
+ self.shift_height = shift_height
+ self.use_color = use_color
+ if isinstance(use_dim, int):
+ use_dim = list(range(use_dim))
+ assert (
+ max(use_dim) < load_dim
+ ), f"Expect all used dimensions < {load_dim}, got {use_dim}"
+ assert coord_type in ["CAMERA", "LIDAR", "DEPTH"]
+
+ self.coord_type = coord_type
+ self.load_dim = load_dim
+ self.use_dim = use_dim
+ self.load_augmented = load_augmented
+ self.reduce_beams = reduce_beams
+
+ def _load_points(self, lidar_path):
+ """Private function to load point clouds data.
+
+ Args:
+ lidar_path (str): Filename of point clouds data.
+
+ Returns:
+ np.ndarray: An array containing point clouds data.
+ """
+ mmcv.check_file_exist(lidar_path)
+ if self.load_augmented:
+ assert self.load_augmented in ["pointpainting", "mvp"]
+ virtual = self.load_augmented == "mvp"
+ points = load_augmented_point_cloud(
+ lidar_path, virtual=virtual, reduce_beams=self.reduce_beams
+ )
+ elif lidar_path.endswith(".npy"):
+ points = np.load(lidar_path)
+ else:
+ points = np.fromfile(lidar_path, dtype=np.float32)
+
+ return points
+
+ def __call__(self, results):
+ """Call function to load points data from file.
+
+ Args:
+ results (dict): Result dict containing point clouds data.
+
+ Returns:
+ dict: The result dict containing the point clouds data. \
+ Added key and value are described below.
+
+ - points (:obj:`BasePoints`): Point clouds data.
+ """
+ lidar_path = results["pts_filename"]
+ points = self._load_points(lidar_path)
+ points = points.reshape(-1, self.load_dim)
+ # TODO: make it more general
+ if self.reduce_beams and self.reduce_beams < 32:
+ points = reduce_LiDAR_beams(points, self.reduce_beams)
+ points = points[:, self.use_dim]
+ attribute_dims = None
+
+ if self.shift_height:
+ floor_height = np.percentile(points[:, 2], 0.99)
+ height = points[:, 2] - floor_height
+ points = np.concatenate(
+ [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1
+ )
+ attribute_dims = dict(height=3)
+
+ if self.use_color:
+ assert len(self.use_dim) >= 6
+ if attribute_dims is None:
+ attribute_dims = dict()
+ attribute_dims.update(
+ dict(
+ color=[
+ points.shape[1] - 3,
+ points.shape[1] - 2,
+ points.shape[1] - 1,
+ ]
+ )
+ )
+
+ points_class = get_points_type(self.coord_type)
+ points = points_class(
+ points, points_dim=points.shape[-1], attribute_dims=attribute_dims
+ )
+ results["points"] = points
+
+ return results
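+
+# Illustrative pipeline entry for the point loader above (a sketch only):
+#   dict(type='CustomLoadPointsFromFile',
+#        coord_type='LIDAR', load_dim=5, use_dim=5)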
diff --git a/mmcv/datasets/pipelines/occflow_label.py b/mmcv/datasets/pipelines/occflow_label.py
new file mode 100644
index 0000000..5ed8fe4
--- /dev/null
+++ b/mmcv/datasets/pipelines/occflow_label.py
@@ -0,0 +1,286 @@
+import torch
+import numpy as np
+import cv2
+
+from mmcv.models.dense_heads.occ_head_plugin import calculate_birds_eye_view_parameters
+
+from mmcv.datasets.builder import PIPELINES
+import os
+
+@PIPELINES.register_module()
+class GenerateOccFlowLabels(object):
+ def __init__(self, grid_conf, ignore_index=255, only_vehicle=True, filter_invisible=True, deal_instance_255=False, all_classes=None, vehicle_classes=None, plan_classes=None):
+ self.grid_conf = grid_conf
+ self.bev_resolution, self.bev_start_position, self.bev_dimension = calculate_birds_eye_view_parameters(
+ grid_conf['xbound'], grid_conf['ybound'], grid_conf['zbound'],
+ )
+ # convert numpy
+ self.bev_resolution = self.bev_resolution.numpy()
+ self.bev_start_position = self.bev_start_position.numpy()
+ self.bev_dimension = self.bev_dimension.numpy()
+ self.spatial_extent = (grid_conf['xbound'][1], grid_conf['ybound'][1])
+ self.ignore_index = ignore_index
+ self.only_vehicle = only_vehicle
+ self.filter_invisible = filter_invisible
+ self.deal_instance_255 = deal_instance_255
+ assert self.deal_instance_255 is False
+
+
+ if all_classes is None:
+ all_classes = ['car', 'truck', 'construction_vehicle', 'bus', 'trailer',
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']
+ if vehicle_classes is None:
+ vehicle_classes = ['car', 'bus', 'construction_vehicle',
+ 'bicycle', 'motorcycle', 'truck', 'trailer']
+ if plan_classes is None:
+ plan_classes = vehicle_classes + ['pedestrian']
+
+ self.vehicle_cls_ids = np.array([all_classes.index(
+ cls_name) for cls_name in vehicle_classes])
+
+ self.plan_cls_ids = np.array([all_classes.index(
+ cls_name) for cls_name in plan_classes])
+
+ if only_vehicle:
+ self.filter_cls_ids = self.vehicle_cls_ids
+ else:
+ self.filter_cls_ids = self.plan_cls_ids
+
+ def reframe_boxes(self, boxes, t_init, t_curr):
+ l2e_r_mat_curr = t_curr['l2e_r']
+ l2e_t_curr = t_curr['l2e_t']
+ e2g_r_mat_curr = t_curr['e2g_r']
+ e2g_t_curr = t_curr['e2g_t']
+
+ l2e_r_mat_init = t_init['l2e_r']
+ l2e_t_init = t_init['l2e_t']
+ e2g_r_mat_init = t_init['e2g_r']
+ e2g_t_init = t_init['e2g_t']
+
+ # to bbox under curr ego frame
+ boxes.rotate(l2e_r_mat_curr.T)
+ boxes.translate(l2e_t_curr)
+
+ # to bbox under world frame
+ boxes.rotate(e2g_r_mat_curr.T)
+ boxes.translate(e2g_t_curr)
+
+ # to bbox under initial ego frame, first inverse translate, then inverse rotate
+ boxes.translate(- e2g_t_init)
+ m1 = np.linalg.inv(e2g_r_mat_init)
+ boxes.rotate(m1.T)
+
+ # to bbox under the initial (reference) lidar frame, first inverse translate, then inverse rotate
+ boxes.translate(- l2e_t_init)
+ m2 = np.linalg.inv(l2e_r_mat_init)
+ boxes.rotate(m2.T)
+
+ return boxes
+
+ def __call__(self, results):
+ """
+ Given lidar-frame bboxes for the current frame and each future frame,
+ generate segmentation, instance, centerness, offset, and forward flow maps.
+ """
+ # Avoid ignoring obj with index = self.ignore_index
+ SPECIAL_INDEX = -20
+
+ all_gt_bboxes_3d = results['future_gt_bboxes_3d']
+ all_gt_labels_3d = results['future_gt_labels_3d']
+ all_gt_inds = results['future_gt_inds']
+ if 'future_gt_vis_tokens' in results.keys():
+ all_vis_tokens = results['future_gt_vis_tokens']
+ else:
+ all_vis_tokens = None
+ num_frame = len(all_gt_bboxes_3d)
+
+ # motion related transforms, of seq lengths
+ l2e_r_mats = results['occ_l2e_r_mats']
+ l2e_t_vecs = results['occ_l2e_t_vecs']
+ e2g_r_mats = results['occ_e2g_r_mats']
+ e2g_t_vecs = results['occ_e2g_t_vecs']
+
+ # reference frame transform
+ t_ref = dict(l2e_r=l2e_r_mats[0], l2e_t=l2e_t_vecs[0], e2g_r=e2g_r_mats[0], e2g_t=e2g_t_vecs[0])
+
+ segmentations = []
+ instances = []
+ gt_future_boxes = []
+ gt_future_labels = []
+
+ # num_frame is 5
+ for i in range(num_frame):
+ # bbox, label, index of curr frame
+ gt_bboxes_3d, gt_labels_3d = all_gt_bboxes_3d[i], all_gt_labels_3d[i]
+ ins_inds = all_gt_inds[i]
+ if all_vis_tokens is not None:
+ vis_tokens = all_vis_tokens[i]
+ else:
+ vis_tokens = None
+
+ if gt_bboxes_3d is None:
+ # for invalid samples, no loss calculated
+ segmentation = np.ones(
+ (self.bev_dimension[1], self.bev_dimension[0])) * self.ignore_index
+ instance = np.ones(
+ (self.bev_dimension[1], self.bev_dimension[0])) * self.ignore_index
+ else:
+ # reframe bboxes to reference frame
+ t_curr = dict(l2e_r=l2e_r_mats[i], l2e_t=l2e_t_vecs[i], e2g_r=e2g_r_mats[i], e2g_t=e2g_t_vecs[i])
+ ref_bboxes_3d = self.reframe_boxes(gt_bboxes_3d, t_ref, t_curr)
+ gt_future_boxes.append(ref_bboxes_3d)
+ gt_future_labels.append(gt_labels_3d)
+
+ # for valid samples
+ segmentation = np.zeros(
+ (self.bev_dimension[1], self.bev_dimension[0]))
+ instance = np.zeros(
+ (self.bev_dimension[1], self.bev_dimension[0]))
+
+ if self.only_vehicle:
+ vehicle_mask = np.isin(gt_labels_3d, self.filter_cls_ids)
+ ref_bboxes_3d = ref_bboxes_3d[vehicle_mask]
+ gt_labels_3d = gt_labels_3d[vehicle_mask]
+ ins_inds = ins_inds[vehicle_mask]
+ if vis_tokens is not None:
+ vis_tokens = vis_tokens[vehicle_mask]
+
+ if self.filter_invisible:
+ assert vis_tokens is not None
+ visible_mask = (vis_tokens != 1) # filter out objects with visibility token 1 (only 0-40% visible)
+ ref_bboxes_3d = ref_bboxes_3d[visible_mask]
+ gt_labels_3d = gt_labels_3d[visible_mask]
+ ins_inds = ins_inds[visible_mask]
+
+ # valid sample and has objects
+ if len(ref_bboxes_3d.tensor) > 0:
+ bbox_corners = ref_bboxes_3d.corners[:, [
+ 0, 3, 7, 4], :2].numpy()
+ bbox_corners = np.round(
+ (bbox_corners - self.bev_start_position[:2] + self.bev_resolution[:2] / 2.0) / self.bev_resolution[:2]).astype(np.int32)
+
+ for index, gt_ind in enumerate(ins_inds):
+ if gt_ind == self.ignore_index:
+ gt_ind = SPECIAL_INDEX # 255 -> -20
+ poly_region = bbox_corners[index]
+
+ cv2.fillPoly(segmentation, [poly_region], 1.0)
+ cv2.fillPoly(instance, [poly_region], int(gt_ind))
+
+ segmentations.append(segmentation)
+ instances.append(instance)
+
+ # segmentation = 1 where objects are located
+ segmentations = torch.from_numpy(
+ np.stack(segmentations, axis=0)).long()
+ instances = torch.from_numpy(np.stack(instances, axis=0)).long()
+
+ # generate heatmap & offset from segmentation & instance
+ instance_centerness, instance_offset, instance_flow, instance_backward_flow = self.center_offset_flow(
+ instances,
+ all_gt_inds,
+ ignore_index=255,
+ )
+
+ invalid_mask = (segmentations[:, 0, 0] == self.ignore_index)
+ instance_centerness[invalid_mask] = self.ignore_index
+
+ results['gt_occ_has_invalid_frame'] = results.pop('occ_has_invalid_frame')
+ results['gt_occ_img_is_valid'] = results.pop('occ_img_is_valid')
+ results.update({
+ 'gt_segmentation': segmentations,
+ 'gt_instance': instances,
+ 'gt_centerness': instance_centerness,
+ 'gt_offset': instance_offset,
+ 'gt_flow': instance_flow,
+ 'gt_backward_flow': instance_backward_flow,
+ 'gt_future_boxes': gt_future_boxes,
+ 'gt_future_labels': gt_future_labels
+ })
+ return results
+
+ def center_offset_flow(self, instance_img, all_gt_inds, ignore_index=255, sigma=3.0):
+ seq_len, h, w = instance_img.shape
+ # heatmap
+ center_label = torch.zeros(seq_len, 1, h, w)
+ # offset from parts to centers
+ offset_label = ignore_index * torch.ones(seq_len, 2, h, w)
+ # future flow
+ future_displacement_label = ignore_index * torch.ones(seq_len, 2, h, w)
+
+ # backward flow
+ backward_flow = ignore_index * torch.ones(seq_len, 2, h, w)
+
+ # x is vertical displacement, y is horizontal displacement
+ x, y = torch.meshgrid(torch.arange(h, dtype=torch.float),
+ torch.arange(w, dtype=torch.float))
+
+ gt_inds_all = []
+ for ins_inds_per_frame in all_gt_inds:
+ if ins_inds_per_frame is None:
+ continue
+ for ins_ind in ins_inds_per_frame:
+ gt_inds_all.append(ins_ind)
+ gt_inds_unique = np.unique(np.array(gt_inds_all))
+
+ # iterate over all instances across this sequence
+ for instance_id in gt_inds_unique:
+ instance_id = int(instance_id)
+ prev_xc = None
+ prev_yc = None
+ prev_mask = None
+ for t in range(seq_len):
+ instance_mask = (instance_img[t] == instance_id)
+ if instance_mask.sum() == 0:
+ # this instance is not in this frame
+ prev_xc = None
+ prev_yc = None
+ prev_mask = None
+ continue
+
+ # the Bird-Eye-View center of the instance
+ xc = x[instance_mask].mean()
+ yc = y[instance_mask].mean()
+
+ off_x = xc - x
+ off_y = yc - y
+ g = torch.exp(-(off_x ** 2 + off_y ** 2) / sigma ** 2)
+ center_label[t, 0] = torch.maximum(center_label[t, 0], g)
+ offset_label[t, 0, instance_mask] = off_x[instance_mask]
+ offset_label[t, 1, instance_mask] = off_y[instance_mask]
+
+ if prev_xc is not None and instance_mask.sum() > 0:
+ delta_x = xc - prev_xc
+ delta_y = yc - prev_yc
+ future_displacement_label[t-1, 0, prev_mask] = delta_x
+ future_displacement_label[t-1, 1, prev_mask] = delta_y
+ backward_flow[t-1, 0, instance_mask] = -1 * delta_x
+ backward_flow[t-1, 1, instance_mask] = -1 * delta_y
+
+ prev_xc = xc
+ prev_yc = yc
+ prev_mask = instance_mask
+
+ return center_label, offset_label, future_displacement_label, backward_flow
+
+
+ def visualize_instances(self, instances, vis_root=''):
+ if vis_root is not None and vis_root != '':
+ os.makedirs(vis_root, exist_ok=True)
+
+ for i, ins in enumerate(instances):
+ ins_c = ins.astype(np.uint8)
+ ins_c = cv2.applyColorMap(ins_c, cv2.COLORMAP_JET)
+ save_path = os.path.join(vis_root, '{}.png'.format(i))
+ cv2.imwrite(save_path, ins_c)
+
+ vid_path = os.path.join(vis_root, 'vid_ins.avi')
+ height, width = instances[0].shape
+ size = (width, height)  # cv2.VideoWriter expects (width, height)
+ v_out = cv2.VideoWriter(vid_path, cv2.VideoWriter_fourcc(*'DIVX'), 4, size)
+ for i in range(len(instances)):
+ ins_c = instances[i].astype(np.uint8)
+ ins_c = cv2.applyColorMap(ins_c, cv2.COLORMAP_JET)
+ v_out.write(ins_c)
+ v_out.release()
+ return
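+
+# Illustrative configuration for the label generator above (a sketch; the grid
+# bounds are example values, not taken from a specific Bench2Drive config):
+#   occflow_grid_conf = dict(xbound=[-50.0, 50.0, 0.5],
+#                            ybound=[-50.0, 50.0, 0.5],
+#                            zbound=[-10.0, 10.0, 20.0])
+#   dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf,
+#        ignore_index=255, only_vehicle=True, filter_invisible=True)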
diff --git a/mmcv/datasets/pipelines/test_time_aug.py b/mmcv/datasets/pipelines/test_time_aug.py
new file mode 100644
index 0000000..4c21d4e
--- /dev/null
+++ b/mmcv/datasets/pipelines/test_time_aug.py
@@ -0,0 +1,233 @@
+import warnings
+
+from mmcv.utils import is_list_of
+from copy import deepcopy
+from ..builder import PIPELINES
+from .compose import Compose
+
+
+@PIPELINES.register_module()
+class MultiScaleFlipAug:
+ """Test-time augmentation with multiple scales and flipping.
+
+ An example configuration is as follows:
+
+ .. code-block::
+
+ img_scale=[(1333, 400), (1333, 800)],
+ flip=True,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ]
+
+ After MultiScaleFlipAug with the above configuration, the results are
+ wrapped into lists of the same length, as follows:
+
+ .. code-block::
+
+ dict(
+ img=[...],
+ img_shape=[...],
+ scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)],
+ flip=[False, True, False, True],
+ ...
+ )
+
+ Args:
+ transforms (list[dict]): Transforms to apply in each augmentation.
+ img_scale (tuple | list[tuple] | None): Image scales for resizing.
+ scale_factor (float | list[float] | None): Scale factors for resizing.
+ flip (bool): Whether to apply flip augmentation. Default: False.
+ flip_direction (str | list[str]): Flip augmentation directions,
+ options are "horizontal", "vertical" and "diagonal". If
+ flip_direction is a list, multiple flip augmentations will be
+ applied. It has no effect when flip == False. Default:
+ "horizontal".
+ """
+
+ def __init__(self,
+ transforms,
+ img_scale=None,
+ scale_factor=None,
+ flip=False,
+ flip_direction='horizontal'):
+ self.transforms = Compose(transforms)
+ assert (img_scale is None) ^ (scale_factor is None), (
+ 'Exactly one of img_scale and scale_factor must be set')
+ if img_scale is not None:
+ self.img_scale = img_scale if isinstance(img_scale,
+ list) else [img_scale]
+ self.scale_key = 'scale'
+ assert is_list_of(self.img_scale, tuple)
+ else:
+ self.img_scale = scale_factor if isinstance(
+ scale_factor, list) else [scale_factor]
+ self.scale_key = 'scale_factor'
+
+ self.flip = flip
+ self.flip_direction = flip_direction if isinstance(
+ flip_direction, list) else [flip_direction]
+ assert is_list_of(self.flip_direction, str)
+ if not self.flip and self.flip_direction != ['horizontal']:
+ warnings.warn(
+ 'flip_direction has no effect when flip is set to False')
+ if (self.flip
+ and not any([t['type'] == 'RandomFlip' for t in transforms])):
+ warnings.warn(
+ 'flip has no effect when RandomFlip is not in transforms')
+
+ def __call__(self, results):
+ """Call function to apply test time augment transforms on results.
+
+ Args:
+ results (dict): Result dict contains the data to transform.
+
+ Returns:
+ dict[str: list]: The augmented data, where each value is wrapped
+ into a list.
+ """
+
+ aug_data = []
+ flip_args = [(False, None)]
+ if self.flip:
+ flip_args += [(True, direction)
+ for direction in self.flip_direction]
+ for scale in self.img_scale:
+ for flip, direction in flip_args:
+ _results = results.copy()
+ _results[self.scale_key] = scale
+ _results['flip'] = flip
+ _results['flip_direction'] = direction
+ data = self.transforms(_results)
+ aug_data.append(data)
+ # list of dict to dict of list
+ aug_data_dict = {key: [] for key in aug_data[0]}
+ for data in aug_data:
+ for key, val in data.items():
+ aug_data_dict[key].append(val)
+ return aug_data_dict
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(transforms={self.transforms}, '
+ repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
+ repr_str += f'flip_direction={self.flip_direction})'
+ return repr_str
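+
+# Worked example of the expansion above (illustrative): with the docstring
+# config (2 scales, flip=True, one flip direction) every key in the returned
+# dict holds 2 scales x (no-flip + flip) = 4 entries, matching the documented
+# scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] ordering.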
+
+@PIPELINES.register_module()
+class MultiScaleFlipAug3D(object):
+ """Test-time augmentation with multiple scales and flipping.
+
+ Args:
+ transforms (list[dict]): Transforms to apply in each augmentation.
+ img_scale (tuple | list[tuple]): Image scales for resizing.
+ pts_scale_ratio (float | list[float]): Points scale ratios for
+ resizing.
+ flip (bool): Whether to apply flip augmentation. Defaults to False.
+ flip_direction (str | list[str]): Flip augmentation directions
+ for images, options are "horizontal" and "vertical".
+ If flip_direction is a list, multiple flip augmentations will
+ be applied. It has no effect when ``flip == False``.
+ Defaults to "horizontal".
+ pcd_horizontal_flip (bool): Whether to apply horizontal flip
+ augmentation to point clouds. Defaults to False. Note that it
+ works only when 'flip' is turned on.
+ pcd_vertical_flip (bool): Whether to apply vertical flip
+ augmentation to point clouds. Defaults to False. Note that it
+ works only when 'flip' is turned on.
+ """
+
+ def __init__(self,
+ transforms,
+ img_scale,
+ pts_scale_ratio,
+ flip=False,
+ flip_direction='horizontal',
+ pcd_horizontal_flip=False,
+ pcd_vertical_flip=False):
+ self.transforms = Compose(transforms)
+ self.img_scale = img_scale if isinstance(img_scale,
+ list) else [img_scale]
+ self.pts_scale_ratio = pts_scale_ratio \
+ if isinstance(pts_scale_ratio, list) else [float(pts_scale_ratio)]
+
+ assert is_list_of(self.img_scale, tuple)
+ assert is_list_of(self.pts_scale_ratio, float)
+
+ self.flip = flip
+ self.pcd_horizontal_flip = pcd_horizontal_flip
+ self.pcd_vertical_flip = pcd_vertical_flip
+
+ self.flip_direction = flip_direction if isinstance(
+ flip_direction, list) else [flip_direction]
+ assert is_list_of(self.flip_direction, str)
+ if not self.flip and self.flip_direction != ['horizontal']:
+ warnings.warn(
+ 'flip_direction has no effect when flip is set to False')
+ if (self.flip and not any([(t['type'] == 'RandomFlip3D'
+ or t['type'] == 'RandomFlip')
+ for t in transforms])):
+ warnings.warn(
+ 'flip has no effect when RandomFlip is not in transforms')
+
+ def __call__(self, results):
+ """Call function to augment common fields in results.
+
+ Args:
+ results (dict): Result dict contains the data to augment.
+
+ Returns:
+ dict: The result dict contains the data that is augmented with \
+ different scales and flips.
+ """
+ aug_data = []
+
+ # modified from `flip_aug = [False, True] if self.flip else [False]`
+ # to reduce unnecessary scenes when using double flip augmentation
+ # during test time
+ flip_aug = [True] if self.flip else [False]
+ pcd_horizontal_flip_aug = [False, True] \
+ if self.flip and self.pcd_horizontal_flip else [False]
+ pcd_vertical_flip_aug = [False, True] \
+ if self.flip and self.pcd_vertical_flip else [False]
+ for scale in self.img_scale:
+ for pts_scale_ratio in self.pts_scale_ratio:
+ for flip in flip_aug:
+ for pcd_horizontal_flip in pcd_horizontal_flip_aug:
+ for pcd_vertical_flip in pcd_vertical_flip_aug:
+ for direction in self.flip_direction:
+ # results.copy will cause bug
+ # since it is shallow copy
+ _results = deepcopy(results)
+ _results['scale'] = scale
+ _results['flip'] = flip
+ _results['pcd_scale_factor'] = \
+ pts_scale_ratio
+ _results['flip_direction'] = direction
+ _results['pcd_horizontal_flip'] = \
+ pcd_horizontal_flip
+ _results['pcd_vertical_flip'] = \
+ pcd_vertical_flip
+ data = self.transforms(_results)
+ aug_data.append(data)
+ # list of dict to dict of list
+ aug_data_dict = {key: [] for key in aug_data[0]}
+ for data in aug_data:
+ for key, val in data.items():
+ aug_data_dict[key].append(val)
+ return aug_data_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(transforms={self.transforms}, '
+ repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
+ repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, '
+ repr_str += f'flip_direction={self.flip_direction})'
+ return repr_str
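+
+# Note on the combinatorics above (illustrative): the number of augmented
+# copies equals len(img_scale) * len(pts_scale_ratio) * len(flip_aug) *
+# len(pcd_horizontal_flip_aug) * len(pcd_vertical_flip_aug) *
+# len(flip_direction); with a single scale, a single ratio and flip=False this
+# reduces to exactly one copy, i.e. a plain single-pass test pipeline.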
+
diff --git a/mmcv/datasets/pipelines/transforms.py b/mmcv/datasets/pipelines/transforms.py
new file mode 100644
index 0000000..e7776cd
--- /dev/null
+++ b/mmcv/datasets/pipelines/transforms.py
@@ -0,0 +1,1906 @@
+import copy
+import inspect
+
+import numpy as np
+from numpy import random
+
+from mmcv.core.mask.structures import PolygonMasks
+from mmcv.core.evaluation.bbox_overlaps import bbox_overlaps
+from mmcv.utils import is_list_of, is_str
+from mmcv.image import imrescale, imresize, imflip, impad, impad_to_multiple, imnormalize, bgr2hsv, hsv2bgr
+from ..builder import PIPELINES
+
+try:
+ from imagecorruptions import corrupt
+except ImportError:
+ corrupt = None
+
+try:
+ import albumentations
+ from albumentations import Compose
+except ImportError:
+ albumentations = None
+ Compose = None
+
+
+@PIPELINES.register_module()
+class Resize:
+ """Resize images & bbox & mask.
+
+ This transform resizes the input image to some scale. Bboxes and masks are
+ then resized with the same scale factor. If the input dict contains the key
+ "scale", then the scale in the input dict is used, otherwise the specified
+ scale in the init method is used. If the input dict contains the key
+ "scale_factor" (if MultiScaleFlipAug does not give img_scale but
+ scale_factor), the actual scale will be computed by image shape and
+ scale_factor.
+
+ `img_scale` can either be a tuple (single-scale) or a list of tuple
+ (multi-scale). There are 3 multiscale modes:
+
+ - ``ratio_range is not None``: randomly sample a ratio from the ratio \
+ range and multiply it with the image scale.
+ - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
+ sample a scale from the multiscale range.
+ - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
+ sample a scale from multiple scales.
+
+ Args:
+ img_scale (tuple or list[tuple]): Image scales for resizing.
+ multiscale_mode (str): Either "range" or "value".
+ ratio_range (tuple[float]): (min_ratio, max_ratio).
+ keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+ image.
+ bbox_clip_border (bool, optional): Whether to clip the objects outside
+ the border of the image. Defaults to True.
+ backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+ These two backends generate slightly different results. Defaults
+ to 'cv2'.
+ override (bool, optional): Whether to override `scale` and
+ `scale_factor` so as to call resize twice. If True, after the
+ first resizing, the existing `scale` and `scale_factor` will be
+ ignored so that a second resizing can be performed. This option
+ is a workaround for the repeated resize in DETR. Defaults to False.
+ """
+
+ def __init__(self,
+ img_scale=None,
+ multiscale_mode='range',
+ ratio_range=None,
+ keep_ratio=True,
+ bbox_clip_border=True,
+ backend='cv2',
+ override=False):
+ if img_scale is None:
+ self.img_scale = None
+ else:
+ if isinstance(img_scale, list):
+ self.img_scale = img_scale
+ else:
+ self.img_scale = [img_scale]
+ assert is_list_of(self.img_scale, tuple)
+
+ if ratio_range is not None:
+ # mode 1: given a scale and a range of image ratio
+ assert len(self.img_scale) == 1
+ else:
+ # mode 2: given multiple scales or a range of scales
+ assert multiscale_mode in ['value', 'range']
+
+ self.backend = backend
+ self.multiscale_mode = multiscale_mode
+ self.ratio_range = ratio_range
+ self.keep_ratio = keep_ratio
+ # TODO: refactor the override option in Resize
+ self.override = override
+ self.bbox_clip_border = bbox_clip_border
+
+ @staticmethod
+ def random_select(img_scales):
+ """Randomly select an img_scale from given candidates.
+
+ Args:
+ img_scales (list[tuple]): Images scales for selection.
+
+ Returns:
+ (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
+ where ``img_scale`` is the selected image scale and \
+ ``scale_idx`` is the selected index in the given candidates.
+ """
+
+ assert is_list_of(img_scales, tuple)
+ scale_idx = np.random.randint(len(img_scales))
+ img_scale = img_scales[scale_idx]
+ return img_scale, scale_idx
+
+ @staticmethod
+ def random_sample(img_scales):
+ """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+ Args:
+ img_scales (list[tuple]): Images scale range for sampling.
+ There must be two tuples in img_scales, which specify the lower
+ and upper bound of image scales.
+
+ Returns:
+ (tuple, None): Returns a tuple ``(img_scale, None)``, where \
+ ``img_scale`` is sampled scale and None is just a placeholder \
+ to be consistent with :func:`random_select`.
+ """
+
+ assert is_list_of(img_scales, tuple) and len(img_scales) == 2
+ img_scale_long = [max(s) for s in img_scales]
+ img_scale_short = [min(s) for s in img_scales]
+ long_edge = np.random.randint(
+ min(img_scale_long),
+ max(img_scale_long) + 1)
+ short_edge = np.random.randint(
+ min(img_scale_short),
+ max(img_scale_short) + 1)
+ img_scale = (long_edge, short_edge)
+ return img_scale, None
+
+ @staticmethod
+ def random_sample_ratio(img_scale, ratio_range):
+ """Randomly sample an img_scale when ``ratio_range`` is specified.
+
+ A ratio will be randomly sampled from the range specified by
+ ``ratio_range``. Then it would be multiplied with ``img_scale`` to
+ generate sampled scale.
+
+ Args:
+ img_scale (tuple): Images scale base to multiply with ratio.
+ ratio_range (tuple[float]): The minimum and maximum ratio to scale
+ the ``img_scale``.
+
+ Returns:
+ (tuple, None): Returns a tuple ``(scale, None)``, where \
+ ``scale`` is sampled ratio multiplied with ``img_scale`` and \
+ None is just a placeholder to be consistent with \
+ :func:`random_select`.
+ """
+
+ assert isinstance(img_scale, tuple) and len(img_scale) == 2
+ min_ratio, max_ratio = ratio_range
+ assert min_ratio <= max_ratio
+ ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+ scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+ return scale, None
+
+ def _random_scale(self, results):
+ """Randomly sample an img_scale according to ``ratio_range`` and
+ ``multiscale_mode``.
+
+ If ``ratio_range`` is specified, a ratio will be sampled and be
+ multiplied with ``img_scale``.
+ If multiple scales are specified by ``img_scale``, a scale will be
+ sampled according to ``multiscale_mode``.
+ Otherwise, single scale will be used.
+
+ Args:
+ results (dict): Result dict from :obj:`dataset`.
+
+ Returns:
+ dict: Two new keys ``scale`` and ``scale_idx`` are added into \
+ ``results``, which would be used by subsequent pipelines.
+ """
+
+ if self.ratio_range is not None:
+ scale, scale_idx = self.random_sample_ratio(
+ self.img_scale[0], self.ratio_range)
+ elif len(self.img_scale) == 1:
+ scale, scale_idx = self.img_scale[0], 0
+ elif self.multiscale_mode == 'range':
+ scale, scale_idx = self.random_sample(self.img_scale)
+ elif self.multiscale_mode == 'value':
+ scale, scale_idx = self.random_select(self.img_scale)
+ else:
+ raise NotImplementedError
+
+ results['scale'] = scale
+ results['scale_idx'] = scale_idx
+
+ def _resize_img(self, results):
+ """Resize images with ``results['scale']``."""
+ for key in results.get('img_fields', ['img']):
+ if self.keep_ratio:
+ img, scale_factor = imrescale(
+ results[key],
+ results['scale'],
+ return_scale=True,
+ backend=self.backend)
+ # w_scale and h_scale may differ slightly from the true rescale factor;
+ # a proper fix should be done in mmcv.imrescale in the future
+ new_h, new_w = img.shape[:2]
+ h, w = results[key].shape[:2]
+ w_scale = new_w / w
+ h_scale = new_h / h
+ else:
+ img, w_scale, h_scale = imresize(
+ results[key],
+ results['scale'],
+ return_scale=True,
+ backend=self.backend)
+ results[key] = img
+
+ scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
+ dtype=np.float32)
+ results['img_shape'] = img.shape
+ # in case that there is no padding
+ results['pad_shape'] = img.shape
+ results['scale_factor'] = scale_factor
+ results['keep_ratio'] = self.keep_ratio
+
+ def _resize_bboxes(self, results):
+ """Resize bounding boxes with ``results['scale_factor']``."""
+ for key in results.get('bbox_fields', []):
+ bboxes = results[key] * results['scale_factor']
+ if self.bbox_clip_border:
+ img_shape = results['img_shape']
+ bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+ bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+ results[key] = bboxes
+
+ def _resize_masks(self, results):
+ """Resize masks with ``results['scale']``"""
+ for key in results.get('mask_fields', []):
+ if results[key] is None:
+ continue
+ if self.keep_ratio:
+ results[key] = results[key].rescale(results['scale'])
+ else:
+ results[key] = results[key].resize(results['img_shape'][:2])
+
+ def _resize_seg(self, results):
+ """Resize semantic segmentation map with ``results['scale']``."""
+ for key in results.get('seg_fields', []):
+ if self.keep_ratio:
+ gt_seg = imrescale(
+ results[key],
+ results['scale'],
+ interpolation='nearest',
+ backend=self.backend)
+ else:
+ gt_seg = imresize(
+ results[key],
+ results['scale'],
+ interpolation='nearest',
+ backend=self.backend)
+ results['gt_semantic_seg'] = gt_seg
+
+ def __call__(self, results):
+ """Call function to resize images, bounding boxes, masks, semantic
+ segmentation map.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
+ 'keep_ratio' keys are added into result dict.
+ """
+
+ if 'scale' not in results:
+ if 'scale_factor' in results:
+ img_shape = results['img'].shape[:2]
+ scale_factor = results['scale_factor']
+ assert isinstance(scale_factor, float)
+ results['scale'] = tuple(
+ [int(x * scale_factor) for x in img_shape][::-1])
+ else:
+ self._random_scale(results)
+ else:
+ if not self.override:
+ assert 'scale_factor' not in results, (
+ 'scale and scale_factor cannot be both set.')
+ else:
+ results.pop('scale')
+ if 'scale_factor' in results:
+ results.pop('scale_factor')
+ self._random_scale(results)
+
+ self._resize_img(results)
+ self._resize_bboxes(results)
+ self._resize_masks(results)
+ self._resize_seg(results)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(img_scale={self.img_scale}, '
+ repr_str += f'multiscale_mode={self.multiscale_mode}, '
+ repr_str += f'ratio_range={self.ratio_range}, '
+ repr_str += f'keep_ratio={self.keep_ratio}, '
+ repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+ return repr_str
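+
+# Illustrative configs for the three multiscale modes described in the Resize
+# docstring (sketch only; the scales are example values):
+#   dict(type='Resize', img_scale=(1280, 720), ratio_range=(0.8, 1.2))  # mode 1
+#   dict(type='Resize', img_scale=[(1333, 640), (1333, 800)],
+#        multiscale_mode='range')                                       # mode 2
+#   dict(type='Resize', img_scale=[(1333, 640), (1333, 800)],
+#        multiscale_mode='value')                                       # mode 3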
+
+
+@PIPELINES.register_module()
+class RandomFlip:
+ """Flip the image & bbox & mask.
+
+ If the input dict contains the key "flip", then the flag will be used,
+ otherwise it will be randomly decided by a ratio specified in the init
+ method.
+
+ When random flip is enabled, ``flip_ratio``/``direction`` can either be a
+ float/string or tuple of float/string. There are 3 flip modes:
+
+ - ``flip_ratio`` is float, ``direction`` is string: the image will be
+ ``direction``ly flipped with probability of ``flip_ratio`` .
+ E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
+ then image will be horizontally flipped with probability of 0.5.
+ - ``flip_ratio`` is float, ``direction`` is list of string: the image will
+ be ``direction[i]``ly flipped with probability of
+ ``flip_ratio/len(direction)``.
+ E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
+ then image will be horizontally flipped with probability of 0.25,
+ vertically with probability of 0.25.
+ - ``flip_ratio`` is list of float, ``direction`` is list of string:
+ given ``len(flip_ratio) == len(direction)``, the image will
+ be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
+ E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
+ 'vertical']``, then image will be horizontally flipped with probability
+ of 0.3, vertically with probability of 0.5
+
+ Args:
+ flip_ratio (float | list[float], optional): The flipping probability.
+ Default: None.
+ direction(str | list[str], optional): The flipping direction. Options
+ are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
+ If input is a list, the length must equal ``flip_ratio``. Each
+ element in ``flip_ratio`` indicates the flip probability of
+ corresponding direction.
+ """
+
+ def __init__(self, flip_ratio=None, direction='horizontal'):
+ if isinstance(flip_ratio, list):
+ assert is_list_of(flip_ratio, float)
+ assert 0 <= sum(flip_ratio) <= 1
+ elif isinstance(flip_ratio, float):
+ assert 0 <= flip_ratio <= 1
+ elif flip_ratio is None:
+ pass
+ else:
+ raise ValueError('flip_ratio must be None, float, '
+ 'or list of float')
+ self.flip_ratio = flip_ratio
+
+ valid_directions = ['horizontal', 'vertical', 'diagonal']
+ if isinstance(direction, str):
+ assert direction in valid_directions
+ elif isinstance(direction, list):
+ assert is_list_of(direction, str)
+ assert set(direction).issubset(set(valid_directions))
+ else:
+ raise ValueError('direction must be either str or list of str')
+ self.direction = direction
+
+ if isinstance(flip_ratio, list):
+ assert len(self.flip_ratio) == len(self.direction)
+
+ def bbox_flip(self, bboxes, img_shape, direction):
+ """Flip bboxes in the given direction.
+
+ Args:
+ bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
+ img_shape (tuple[int]): Image shape (height, width)
+ direction (str): Flip direction. Options are 'horizontal',
+ 'vertical' and 'diagonal'.
+
+ Returns:
+ numpy.ndarray: Flipped bounding boxes.
+ """
+
+ assert bboxes.shape[-1] % 4 == 0
+ flipped = bboxes.copy()
+ if direction == 'horizontal':
+ w = img_shape[1]
+ flipped[..., 0::4] = w - bboxes[..., 2::4]
+ flipped[..., 2::4] = w - bboxes[..., 0::4]
+ elif direction == 'vertical':
+ h = img_shape[0]
+ flipped[..., 1::4] = h - bboxes[..., 3::4]
+ flipped[..., 3::4] = h - bboxes[..., 1::4]
+ elif direction == 'diagonal':
+ w = img_shape[1]
+ h = img_shape[0]
+ flipped[..., 0::4] = w - bboxes[..., 2::4]
+ flipped[..., 1::4] = h - bboxes[..., 3::4]
+ flipped[..., 2::4] = w - bboxes[..., 0::4]
+ flipped[..., 3::4] = h - bboxes[..., 1::4]
+ else:
+ raise ValueError(f"Invalid flipping direction '{direction}'")
+ return flipped
+
+ def __call__(self, results):
+ """Call function to flip bounding boxes, masks, semantic segmentation
+ maps.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Flipped results, 'flip', 'flip_direction' keys are added \
+ into result dict.
+ """
+
+ if 'flip' not in results:
+ if isinstance(self.direction, list):
+ # None means non-flip
+ direction_list = self.direction + [None]
+ else:
+ # None means non-flip
+ direction_list = [self.direction, None]
+
+ if isinstance(self.flip_ratio, list):
+ non_flip_ratio = 1 - sum(self.flip_ratio)
+ flip_ratio_list = self.flip_ratio + [non_flip_ratio]
+ else:
+ non_flip_ratio = 1 - self.flip_ratio
+ # exclude non-flip
+ single_ratio = self.flip_ratio / (len(direction_list) - 1)
+ flip_ratio_list = [single_ratio] * (len(direction_list) -
+ 1) + [non_flip_ratio]
+
+ cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
+
+ results['flip'] = cur_dir is not None
+ if 'flip_direction' not in results:
+ results['flip_direction'] = cur_dir
+ if results['flip']:
+ # flip image
+ for key in results.get('img_fields', ['img']):
+ results[key] = imflip(
+ results[key], direction=results['flip_direction'])
+ # flip bboxes
+ for key in results.get('bbox_fields', []):
+ results[key] = self.bbox_flip(results[key],
+ results['img_shape'],
+ results['flip_direction'])
+ # flip masks
+ for key in results.get('mask_fields', []):
+ results[key] = results[key].flip(results['flip_direction'])
+
+ # flip segs
+ for key in results.get('seg_fields', []):
+ results[key] = imflip(
+ results[key], direction=results['flip_direction'])
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})'
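+
+# Illustrative configs for the three flip modes described in the RandomFlip
+# docstring (sketch only):
+#   dict(type='RandomFlip', flip_ratio=0.5)                              # mode 1
+#   dict(type='RandomFlip', flip_ratio=0.5,
+#        direction=['horizontal', 'vertical'])                           # mode 2
+#   dict(type='RandomFlip', flip_ratio=[0.3, 0.5],
+#        direction=['horizontal', 'vertical'])                           # mode 3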
+
+
+@PIPELINES.register_module()
+class RandomShift:
+ """Shift the image and box given shift pixels and probability.
+
+ Args:
+ shift_ratio (float): Probability of shifts. Default 0.5.
+ max_shift_px (int): The max pixels for shifting. Default 32.
+ filter_thr_px (int): The width and height threshold for filtering.
+ Bboxes (and their associated labels) whose width or height falls
+ below this threshold are filtered out. Default 1.
+ """
+
+ def __init__(self, shift_ratio=0.5, max_shift_px=32, filter_thr_px=1):
+ assert 0 <= shift_ratio <= 1
+ assert max_shift_px >= 0
+ self.shift_ratio = shift_ratio
+ self.max_shift_px = max_shift_px
+ self.filter_thr_px = int(filter_thr_px)
+ # The key correspondence from bboxes to labels.
+ self.bbox2label = {
+ 'gt_bboxes': 'gt_labels',
+ 'gt_bboxes_ignore': 'gt_labels_ignore'
+ }
+
+ def __call__(self, results):
+ """Call function to random shift images, bounding boxes.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Shift results.
+ """
+ if random.random() < self.shift_ratio:
+ img_shape = results['img'].shape[:2]
+
+ random_shift_x = random.randint(-self.max_shift_px,
+ self.max_shift_px)
+ random_shift_y = random.randint(-self.max_shift_px,
+ self.max_shift_px)
+ new_x = max(0, random_shift_x)
+ orig_x = max(0, -random_shift_x)
+ new_y = max(0, random_shift_y)
+ orig_y = max(0, -random_shift_y)
+
+ # TODO: support mask and semantic segmentation maps.
+ for key in results.get('bbox_fields', []):
+ bboxes = results[key].copy()
+ bboxes[..., 0::2] += random_shift_x
+ bboxes[..., 1::2] += random_shift_y
+
+ # clip border
+ bboxes[..., 0::2] = np.clip(bboxes[..., 0::2], 0, img_shape[1])
+ bboxes[..., 1::2] = np.clip(bboxes[..., 1::2], 0, img_shape[0])
+
+ # remove invalid bboxes
+ bbox_w = bboxes[..., 2] - bboxes[..., 0]
+ bbox_h = bboxes[..., 3] - bboxes[..., 1]
+ valid_inds = (bbox_w > self.filter_thr_px) & (
+ bbox_h > self.filter_thr_px)
+ # If the shift does not contain any gt-bbox area, skip this
+ # image.
+ if key == 'gt_bboxes' and not valid_inds.any():
+ return results
+ bboxes = bboxes[valid_inds]
+ results[key] = bboxes
+
+ # label fields. e.g. gt_labels and gt_labels_ignore
+ label_key = self.bbox2label.get(key)
+ if label_key in results:
+ results[label_key] = results[label_key][valid_inds]
+
+ for key in results.get('img_fields', ['img']):
+ img = results[key]
+ new_img = np.zeros_like(img)
+ img_h, img_w = img.shape[:2]
+ new_h = img_h - np.abs(random_shift_y)
+ new_w = img_w - np.abs(random_shift_x)
+ new_img[new_y:new_y + new_h, new_x:new_x + new_w] \
+ = img[orig_y:orig_y + new_h, orig_x:orig_x + new_w]
+ results[key] = new_img
+
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(max_shift_px={self.max_shift_px})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class Pad:
+ """Pad the image & mask.
+
+ There are two padding modes: (1) pad to a fixed size and (2) pad to the
+ minimum size that is divisible by some number.
+ Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
+
+ Args:
+ size (tuple, optional): Fixed padding size.
+ size_divisor (int, optional): The divisor of padded size.
+ pad_val (float, optional): Padding value, 0 by default.
+ """
+
+ def __init__(self, size=None, size_divisor=None, pad_val=0):
+ self.size = size
+ self.size_divisor = size_divisor
+ self.pad_val = pad_val
+ # only one of size and size_divisor should be valid
+ assert size is not None or size_divisor is not None
+ assert size is None or size_divisor is None
+
+ def _pad_img(self, results):
+ """Pad images according to ``self.size``."""
+ for key in results.get('img_fields', ['img']):
+ if self.size is not None:
+ padded_img = impad(
+ results[key], shape=self.size, pad_val=self.pad_val)
+ elif self.size_divisor is not None:
+ padded_img = impad_to_multiple(
+ results[key], self.size_divisor, pad_val=self.pad_val)
+ results[key] = padded_img
+ results['pad_shape'] = padded_img.shape
+ results['pad_fixed_size'] = self.size
+ results['pad_size_divisor'] = self.size_divisor
+
+ def _pad_masks(self, results):
+ """Pad masks according to ``results['pad_shape']``."""
+ pad_shape = results['pad_shape'][:2]
+ for key in results.get('mask_fields', []):
+ results[key] = results[key].pad(pad_shape, pad_val=self.pad_val)
+
+ def _pad_seg(self, results):
+ """Pad semantic segmentation map according to
+ ``results['pad_shape']``."""
+ for key in results.get('seg_fields', []):
+ results[key] = impad(
+ results[key], shape=results['pad_shape'][:2])
+
+ def __call__(self, results):
+ """Call function to pad images, masks, semantic segmentation maps.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Updated result dict.
+ """
+ self._pad_img(results)
+ self._pad_masks(results)
+ self._pad_seg(results)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(size={self.size}, '
+ repr_str += f'size_divisor={self.size_divisor}, '
+ repr_str += f'pad_val={self.pad_val})'
+ return repr_str
+
+
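+# Illustrative usage sketch (editor's addition, not part of the original
+# file): pad a dummy image so that both sides become divisible by 32. The
+# image size below is a hypothetical placeholder.
+def _example_pad_to_multiple():
+    results = dict(img=np.zeros((100, 120, 3), dtype=np.uint8))
+    transform = Pad(size_divisor=32)
+    results = transform(results)
+    # 'pad_shape' is added by Pad; here it becomes (128, 128, 3).
+    return results['pad_shape']
+
+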
+@PIPELINES.register_module()
+class Normalize:
+ """Normalize the image.
+
+ Added key is "img_norm_cfg".
+
+ Args:
+ mean (sequence): Mean values of 3 channels.
+ std (sequence): Std values of 3 channels.
+ to_rgb (bool): Whether to convert the image from BGR to RGB,
+ default is true.
+ """
+
+ def __init__(self, mean, std, to_rgb=True):
+ self.mean = np.array(mean, dtype=np.float32)
+ self.std = np.array(std, dtype=np.float32)
+ self.to_rgb = to_rgb
+
+ def __call__(self, results):
+ """Call function to normalize images.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Normalized results, 'img_norm_cfg' key is added into
+ result dict.
+ """
+ for key in results.get('img_fields', ['img']):
+ results[key] = imnormalize(results[key], self.mean, self.std,
+ self.to_rgb)
+ results['img_norm_cfg'] = dict(
+ mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
+ return repr_str
+
+
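+# Illustrative usage sketch (editor's addition, not part of the original
+# file): normalize a dummy BGR image with hypothetical statistics and read
+# back the recorded 'img_norm_cfg'.
+def _example_normalize():
+    results = dict(img=np.ones((4, 4, 3), dtype=np.float32) * 128)
+    transform = Normalize(mean=[128, 128, 128], std=[64, 64, 64], to_rgb=True)
+    results = transform(results)
+    # The image is converted from BGR to RGB and standardized channel-wise.
+    return results['img'], results['img_norm_cfg']
+
+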
+@PIPELINES.register_module()
+class RandomCrop:
+ """Random crop the image & bboxes & masks.
+
+ The absolute `crop_size` is sampled based on `crop_type` and `image_size`,
+ then the cropped results are generated.
+
+ Args:
+ crop_size (tuple): The relative ratio or absolute pixels of
+ height and width.
+ crop_type (str, optional): one of "relative_range", "relative",
+ "absolute", "absolute_range". "relative" randomly crops
+ (h * crop_size[0], w * crop_size[1]) part from an input of size
+ (h, w). "relative_range" uniformly samples relative crop size from
+ range [crop_size[0], 1] and [crop_size[1], 1] for height and width
+ respectively. "absolute" crops from an input with absolute size
+ (crop_size[0], crop_size[1]). "absolute_range" uniformly samples
+ crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
+ in range [crop_size[0], min(w, crop_size[1])]. Default "absolute".
+ allow_negative_crop (bool, optional): Whether to allow a crop that does
+ not contain any bbox area. Default False.
+ bbox_clip_border (bool, optional): Whether clip the objects outside
+ the border of the image. Defaults to True.
+
+ Note:
+ - If the image is smaller than the absolute crop size, return the
+ original image.
+ - The keys for bboxes, labels and masks must be aligned. That is,
+ `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
+ `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
+ `gt_masks_ignore`.
+ - If the crop does not contain any gt-bbox region and
+ `allow_negative_crop` is set to False, skip this image.
+ """
+
+ def __init__(self,
+ crop_size,
+ crop_type='absolute',
+ allow_negative_crop=False,
+ bbox_clip_border=True):
+ if crop_type not in [
+ 'relative_range', 'relative', 'absolute', 'absolute_range'
+ ]:
+ raise ValueError(f'Invalid crop_type {crop_type}.')
+ if crop_type in ['absolute', 'absolute_range']:
+ assert crop_size[0] > 0 and crop_size[1] > 0
+ assert isinstance(crop_size[0], int) and isinstance(
+ crop_size[1], int)
+ else:
+ assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1
+ self.crop_size = crop_size
+ self.crop_type = crop_type
+ self.allow_negative_crop = allow_negative_crop
+ self.bbox_clip_border = bbox_clip_border
+ # The key correspondence from bboxes to labels and masks.
+ self.bbox2label = {
+ 'gt_bboxes': 'gt_labels',
+ 'gt_bboxes_ignore': 'gt_labels_ignore'
+ }
+ self.bbox2mask = {
+ 'gt_bboxes': 'gt_masks',
+ 'gt_bboxes_ignore': 'gt_masks_ignore'
+ }
+
+ def _crop_data(self, results, crop_size, allow_negative_crop):
+ """Function to randomly crop images, bounding boxes, masks, semantic
+ segmentation maps.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+ crop_size (tuple): Expected absolute size after cropping, (h, w).
+ allow_negative_crop (bool): Whether to allow a crop that does not
+ contain any bbox area. Default to False.
+
+ Returns:
+ dict: Randomly cropped results, 'img_shape' key in result dict is
+ updated according to crop size.
+ """
+ assert crop_size[0] > 0 and crop_size[1] > 0
+ for key in results.get('img_fields', ['img']):
+ img = results[key]
+ margin_h = max(img.shape[0] - crop_size[0], 0)
+ margin_w = max(img.shape[1] - crop_size[1], 0)
+ offset_h = np.random.randint(0, margin_h + 1)
+ offset_w = np.random.randint(0, margin_w + 1)
+ crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
+ crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
+
+ # crop the image
+ img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
+ img_shape = img.shape
+ results[key] = img
+ results['img_shape'] = img_shape
+
+ # crop bboxes accordingly and clip to the image boundary
+ for key in results.get('bbox_fields', []):
+ # e.g. gt_bboxes and gt_bboxes_ignore
+ bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
+ dtype=np.float32)
+ bboxes = results[key] - bbox_offset
+ if self.bbox_clip_border:
+ bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+ bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+ valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
+ bboxes[:, 3] > bboxes[:, 1])
+ # If the crop does not contain any gt-bbox area and
+ # allow_negative_crop is False, skip this image.
+ if (key == 'gt_bboxes' and not valid_inds.any()
+ and not allow_negative_crop):
+ return None
+ results[key] = bboxes[valid_inds, :]
+ # label fields. e.g. gt_labels and gt_labels_ignore
+ label_key = self.bbox2label.get(key)
+ if label_key in results:
+ results[label_key] = results[label_key][valid_inds]
+
+ # mask fields, e.g. gt_masks and gt_masks_ignore
+ mask_key = self.bbox2mask.get(key)
+ if mask_key in results:
+ results[mask_key] = results[mask_key][
+ valid_inds.nonzero()[0]].crop(
+ np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
+
+ # crop semantic seg
+ for key in results.get('seg_fields', []):
+ results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
+
+ return results
+
+ def _get_crop_size(self, image_size):
+ """Randomly generates the absolute crop size based on `crop_type` and
+ `image_size`.
+
+ Args:
+ image_size (tuple): (h, w).
+
+ Returns:
+ crop_size (tuple): (crop_h, crop_w) in absolute pixels.
+ """
+ h, w = image_size
+ if self.crop_type == 'absolute':
+ return (min(self.crop_size[0], h), min(self.crop_size[1], w))
+ elif self.crop_type == 'absolute_range':
+ assert self.crop_size[0] <= self.crop_size[1]
+ crop_h = np.random.randint(
+ min(h, self.crop_size[0]),
+ min(h, self.crop_size[1]) + 1)
+ crop_w = np.random.randint(
+ min(w, self.crop_size[0]),
+ min(w, self.crop_size[1]) + 1)
+ return crop_h, crop_w
+ elif self.crop_type == 'relative':
+ crop_h, crop_w = self.crop_size
+ return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
+ elif self.crop_type == 'relative_range':
+ crop_size = np.asarray(self.crop_size, dtype=np.float32)
+ crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size)
+ return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
+
+ def __call__(self, results):
+ """Call function to randomly crop images, bounding boxes, masks,
+ semantic segmentation maps.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Randomly cropped results, 'img_shape' key in result dict is
+ updated according to crop size.
+ """
+ image_size = results['img'].shape[:2]
+ crop_size = self._get_crop_size(image_size)
+ results = self._crop_data(results, crop_size, self.allow_negative_crop)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(crop_size={self.crop_size}, '
+ repr_str += f'crop_type={self.crop_type}, '
+ repr_str += f'allow_negative_crop={self.allow_negative_crop}, '
+ repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+ return repr_str
+
+
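+# Illustrative usage sketch (editor's addition, not part of the original
+# file): absolute-size random crop of a dummy image with one box. Note that
+# the call may return None when the crop misses every gt box and
+# ``allow_negative_crop`` is False.
+def _example_random_crop():
+    results = dict(
+        img=np.zeros((100, 100, 3), dtype=np.uint8),
+        img_fields=['img'],
+        bbox_fields=['gt_bboxes'],
+        gt_bboxes=np.array([[10., 10., 60., 60.]], dtype=np.float32),
+        gt_labels=np.array([0], dtype=np.int64))
+    transform = RandomCrop(crop_size=(50, 50), crop_type='absolute')
+    return transform(results)
+
+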
+@PIPELINES.register_module()
+class SegRescale:
+ """Rescale semantic segmentation maps.
+
+ Args:
+ scale_factor (float): The scale factor of the final output.
+ backend (str): Image rescale backend, choices are 'cv2' and 'pillow'.
+            These two backends generate slightly different results. Defaults
+ to 'cv2'.
+ """
+
+ def __init__(self, scale_factor=1, backend='cv2'):
+ self.scale_factor = scale_factor
+ self.backend = backend
+
+ def __call__(self, results):
+ """Call function to scale the semantic segmentation map.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Result dict with semantic segmentation map scaled.
+ """
+
+ for key in results.get('seg_fields', []):
+ if self.scale_factor != 1:
+ results[key] = imrescale(
+ results[key],
+ self.scale_factor,
+ interpolation='nearest',
+ backend=self.backend)
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(scale_factor={self.scale_factor})'
+
+
+@PIPELINES.register_module()
+class PhotoMetricDistortion:
+ """Apply photometric distortion to image sequentially, every transformation
+ is applied with a probability of 0.5. The position of random contrast is in
+ second or second to last.
+
+ 1. random brightness
+ 2. random contrast (mode 0)
+ 3. convert color from BGR to HSV
+ 4. random saturation
+ 5. random hue
+ 6. convert color from HSV to BGR
+ 7. random contrast (mode 1)
+ 8. randomly swap channels
+
+ Args:
+ brightness_delta (int): delta of brightness.
+ contrast_range (tuple): range of contrast.
+ saturation_range (tuple): range of saturation.
+ hue_delta (int): delta of hue.
+ """
+
+ def __init__(self,
+ brightness_delta=32,
+ contrast_range=(0.5, 1.5),
+ saturation_range=(0.5, 1.5),
+ hue_delta=18):
+ self.brightness_delta = brightness_delta
+ self.contrast_lower, self.contrast_upper = contrast_range
+ self.saturation_lower, self.saturation_upper = saturation_range
+ self.hue_delta = hue_delta
+
+ def __call__(self, results):
+ """Call function to perform photometric distortion on images.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Result dict with images distorted.
+ """
+
+ if 'img_fields' in results:
+ assert results['img_fields'] == ['img'], \
+ 'Only single img_fields is allowed'
+ img = results['img']
+ assert img.dtype == np.float32, \
+ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\
+ ' please set "to_float32=True" in "LoadImageFromFile" pipeline'
+ # random brightness
+ if random.randint(2):
+ delta = random.uniform(-self.brightness_delta,
+ self.brightness_delta)
+ img += delta
+
+ # mode == 0 --> do random contrast first
+ # mode == 1 --> do random contrast last
+ mode = random.randint(2)
+ if mode == 1:
+ if random.randint(2):
+ alpha = random.uniform(self.contrast_lower,
+ self.contrast_upper)
+ img *= alpha
+
+ # convert color from BGR to HSV
+ img = bgr2hsv(img)
+
+ # random saturation
+ if random.randint(2):
+ img[..., 1] *= random.uniform(self.saturation_lower,
+ self.saturation_upper)
+
+ # random hue
+ if random.randint(2):
+ img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
+ img[..., 0][img[..., 0] > 360] -= 360
+ img[..., 0][img[..., 0] < 0] += 360
+
+ # convert color from HSV to BGR
+ img = hsv2bgr(img)
+
+ # random contrast
+ if mode == 0:
+ if random.randint(2):
+ alpha = random.uniform(self.contrast_lower,
+ self.contrast_upper)
+ img *= alpha
+
+ # randomly swap channels
+ if random.randint(2):
+ img = img[..., random.permutation(3)]
+
+ results['img'] = img
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
+ repr_str += 'contrast_range='
+ repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
+ repr_str += 'saturation_range='
+ repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
+ repr_str += f'hue_delta={self.hue_delta})'
+ return repr_str
+
+
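+# Illustrative usage sketch (editor's addition, not part of the original
+# file): distort a dummy float32 image. The transform requires float32 input,
+# as enforced by the assertion in ``__call__``.
+def _example_photo_metric_distortion():
+    results = dict(img=np.random.uniform(0, 255, (32, 32, 3)).astype(np.float32))
+    transform = PhotoMetricDistortion(brightness_delta=32,
+                                      contrast_range=(0.5, 1.5),
+                                      saturation_range=(0.5, 1.5),
+                                      hue_delta=18)
+    return transform(results)
+
+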
+@PIPELINES.register_module()
+class Expand:
+ """Random expand the image & bboxes.
+
+ Randomly place the original image on a canvas of 'ratio' x original image
+ size filled with mean values. The ratio is in the range of ratio_range.
+
+ Args:
+ mean (tuple): mean value of dataset.
+        to_rgb (bool): whether to convert the order of mean values to align
+            with RGB.
+        ratio_range (tuple): range of expand ratio.
+        prob (float): probability of applying this transformation.
+ """
+
+ def __init__(self,
+ mean=(0, 0, 0),
+ to_rgb=True,
+ ratio_range=(1, 4),
+ seg_ignore_label=None,
+ prob=0.5):
+ self.to_rgb = to_rgb
+ self.ratio_range = ratio_range
+ if to_rgb:
+ self.mean = mean[::-1]
+ else:
+ self.mean = mean
+ self.min_ratio, self.max_ratio = ratio_range
+ self.seg_ignore_label = seg_ignore_label
+ self.prob = prob
+
+ def __call__(self, results):
+ """Call function to expand images, bounding boxes.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Result dict with images, bounding boxes expanded
+ """
+
+ if random.uniform(0, 1) > self.prob:
+ return results
+
+ if 'img_fields' in results:
+ assert results['img_fields'] == ['img'], \
+ 'Only single img_fields is allowed'
+ img = results['img']
+
+ h, w, c = img.shape
+ ratio = random.uniform(self.min_ratio, self.max_ratio)
+ # speedup expand when meets large image
+ if np.all(self.mean == self.mean[0]):
+ expand_img = np.empty((int(h * ratio), int(w * ratio), c),
+ img.dtype)
+ expand_img.fill(self.mean[0])
+ else:
+ expand_img = np.full((int(h * ratio), int(w * ratio), c),
+ self.mean,
+ dtype=img.dtype)
+ left = int(random.uniform(0, w * ratio - w))
+ top = int(random.uniform(0, h * ratio - h))
+ expand_img[top:top + h, left:left + w] = img
+
+ results['img'] = expand_img
+ # expand bboxes
+ for key in results.get('bbox_fields', []):
+ results[key] = results[key] + np.tile(
+ (left, top), 2).astype(results[key].dtype)
+
+ # expand masks
+ for key in results.get('mask_fields', []):
+ results[key] = results[key].expand(
+ int(h * ratio), int(w * ratio), top, left)
+
+ # expand segs
+ for key in results.get('seg_fields', []):
+ gt_seg = results[key]
+ expand_gt_seg = np.full((int(h * ratio), int(w * ratio)),
+ self.seg_ignore_label,
+ dtype=gt_seg.dtype)
+ expand_gt_seg[top:top + h, left:left + w] = gt_seg
+ results[key] = expand_gt_seg
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
+ repr_str += f'ratio_range={self.ratio_range}, '
+ repr_str += f'seg_ignore_label={self.seg_ignore_label})'
+ return repr_str
+
+
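+# Illustrative usage sketch (editor's addition, not part of the original
+# file): expand a dummy image and its single box onto a larger mean-filled
+# canvas. ``prob=1.0`` forces the expansion to happen.
+def _example_expand():
+    results = dict(
+        img=np.zeros((20, 20, 3), dtype=np.uint8),
+        bbox_fields=['gt_bboxes'],
+        gt_bboxes=np.array([[2., 2., 10., 10.]], dtype=np.float32))
+    transform = Expand(mean=(0, 0, 0), to_rgb=True, ratio_range=(1.5, 2), prob=1.0)
+    return transform(results)
+
+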
+@PIPELINES.register_module()
+class MinIoURandomCrop:
+ """Random crop the image & bboxes, the cropped patches have minimum IoU
+ requirement with original image & bboxes, the IoU threshold is randomly
+ selected from min_ious.
+
+ Args:
+ min_ious (tuple): minimum IoU threshold for all intersections with
+ bounding boxes
+ min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+ where a >= min_crop_size).
+ bbox_clip_border (bool, optional): Whether clip the objects outside
+ the border of the image. Defaults to True.
+
+ Note:
+ The keys for bboxes, labels and masks should be paired. That is, \
+ `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
+ `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
+ """
+
+ def __init__(self,
+ min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+ min_crop_size=0.3,
+ bbox_clip_border=True):
+ # 1: return ori img
+ self.min_ious = min_ious
+ self.sample_mode = (1, *min_ious, 0)
+ self.min_crop_size = min_crop_size
+ self.bbox_clip_border = bbox_clip_border
+ self.bbox2label = {
+ 'gt_bboxes': 'gt_labels',
+ 'gt_bboxes_ignore': 'gt_labels_ignore'
+ }
+ self.bbox2mask = {
+ 'gt_bboxes': 'gt_masks',
+ 'gt_bboxes_ignore': 'gt_masks_ignore'
+ }
+
+ def __call__(self, results):
+ """Call function to crop images and bounding boxes with minimum IoU
+ constraint.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Result dict with images and bounding boxes cropped, \
+ 'img_shape' key is updated.
+ """
+
+ if 'img_fields' in results:
+ assert results['img_fields'] == ['img'], \
+ 'Only single img_fields is allowed'
+ img = results['img']
+ assert 'bbox_fields' in results
+ boxes = [results[key] for key in results['bbox_fields']]
+ boxes = np.concatenate(boxes, 0)
+ h, w, c = img.shape
+ while True:
+ mode = random.choice(self.sample_mode)
+ self.mode = mode
+ if mode == 1:
+ return results
+
+ min_iou = mode
+ for i in range(50):
+ new_w = random.uniform(self.min_crop_size * w, w)
+ new_h = random.uniform(self.min_crop_size * h, h)
+
+ # h / w in [0.5, 2]
+ if new_h / new_w < 0.5 or new_h / new_w > 2:
+ continue
+
+ left = random.uniform(w - new_w)
+ top = random.uniform(h - new_h)
+
+ patch = np.array(
+ (int(left), int(top), int(left + new_w), int(top + new_h)))
+ # Line or point crop is not allowed
+ if patch[2] == patch[0] or patch[3] == patch[1]:
+ continue
+ overlaps = bbox_overlaps(
+ patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
+ if len(overlaps) > 0 and overlaps.min() < min_iou:
+ continue
+
+ # center of boxes should inside the crop img
+ # only adjust boxes and instance masks when the gt is not empty
+ if len(overlaps) > 0:
+ # adjust boxes
+ def is_center_of_bboxes_in_patch(boxes, patch):
+ center = (boxes[:, :2] + boxes[:, 2:]) / 2
+ mask = ((center[:, 0] > patch[0]) *
+ (center[:, 1] > patch[1]) *
+ (center[:, 0] < patch[2]) *
+ (center[:, 1] < patch[3]))
+ return mask
+
+ mask = is_center_of_bboxes_in_patch(boxes, patch)
+ if not mask.any():
+ continue
+ for key in results.get('bbox_fields', []):
+ boxes = results[key].copy()
+ mask = is_center_of_bboxes_in_patch(boxes, patch)
+ boxes = boxes[mask]
+ if self.bbox_clip_border:
+ boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
+ boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
+ boxes -= np.tile(patch[:2], 2)
+
+ results[key] = boxes
+ # labels
+ label_key = self.bbox2label.get(key)
+ if label_key in results:
+ results[label_key] = results[label_key][mask]
+
+ # mask fields
+ mask_key = self.bbox2mask.get(key)
+ if mask_key in results:
+ results[mask_key] = results[mask_key][
+ mask.nonzero()[0]].crop(patch)
+ # adjust the img no matter whether the gt is empty before crop
+ img = img[patch[1]:patch[3], patch[0]:patch[2]]
+ results['img'] = img
+ results['img_shape'] = img.shape
+
+ # seg fields
+ for key in results.get('seg_fields', []):
+ results[key] = results[key][patch[1]:patch[3],
+ patch[0]:patch[2]]
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(min_ious={self.min_ious}, '
+ repr_str += f'min_crop_size={self.min_crop_size}, '
+ repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+ return repr_str
+
+
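+# Illustrative pipeline-config sketch (editor's addition, not part of the
+# original file): Expand and MinIoURandomCrop are commonly chained in
+# SSD-style augmentation. The values below mirror the class defaults and are
+# placeholders, not settings taken from this repository's configs.
+_EXAMPLE_MIN_IOU_CROP_PIPELINE = [
+    dict(type='Expand', mean=(0, 0, 0), to_rgb=True, ratio_range=(1, 4)),
+    dict(type='MinIoURandomCrop',
+         min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
+         min_crop_size=0.3),
+]
+
+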
+@PIPELINES.register_module()
+class Corrupt:
+ """Corruption augmentation.
+
+    Corruption transforms implemented based on the
+    ``imagecorruptions`` package.
+
+ Args:
+ corruption (str): Corruption name.
+ severity (int, optional): The severity of corruption. Default: 1.
+ """
+
+ def __init__(self, corruption, severity=1):
+ self.corruption = corruption
+ self.severity = severity
+
+ def __call__(self, results):
+ """Call function to corrupt image.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Result dict with images corrupted.
+ """
+
+ if corrupt is None:
+ raise RuntimeError('imagecorruptions is not installed')
+ if 'img_fields' in results:
+ assert results['img_fields'] == ['img'], \
+ 'Only single img_fields is allowed'
+ results['img'] = corrupt(
+ results['img'].astype(np.uint8),
+ corruption_name=self.corruption,
+ severity=self.severity)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(corruption={self.corruption}, '
+ repr_str += f'severity={self.severity})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class Albu:
+ """Albumentation augmentation.
+
+    Adds custom transformations from the Albumentations library.
+    Please visit https://albumentations.readthedocs.io
+    for more information.
+
+ An example of ``transforms`` is as followed:
+
+ .. code-block::
+
+ [
+ dict(
+ type='ShiftScaleRotate',
+ shift_limit=0.0625,
+ scale_limit=0.0,
+ rotate_limit=0,
+ interpolation=1,
+ p=0.5),
+ dict(
+ type='RandomBrightnessContrast',
+ brightness_limit=[0.1, 0.3],
+ contrast_limit=[0.1, 0.3],
+ p=0.2),
+ dict(type='ChannelShuffle', p=0.1),
+ dict(
+ type='OneOf',
+ transforms=[
+ dict(type='Blur', blur_limit=3, p=1.0),
+ dict(type='MedianBlur', blur_limit=3, p=1.0)
+ ],
+ p=0.1),
+ ]
+
+ Args:
+ transforms (list[dict]): A list of albu transformations
+ bbox_params (dict): Bbox_params for albumentation `Compose`
+ keymap (dict): Contains {'input key':'albumentation-style key'}
+        skip_img_without_anno (bool): Whether to skip the image if no
+            annotations are left after augmentation.
+ """
+
+ def __init__(self,
+ transforms,
+ bbox_params=None,
+ keymap=None,
+ update_pad_shape=False,
+ skip_img_without_anno=False):
+ if Compose is None:
+ raise RuntimeError('albumentations is not installed')
+
+ # Args will be modified later, copying it will be safer
+ transforms = copy.deepcopy(transforms)
+ if bbox_params is not None:
+ bbox_params = copy.deepcopy(bbox_params)
+ if keymap is not None:
+ keymap = copy.deepcopy(keymap)
+ self.transforms = transforms
+ self.filter_lost_elements = False
+ self.update_pad_shape = update_pad_shape
+ self.skip_img_without_anno = skip_img_without_anno
+
+ # A simple workaround to remove masks without boxes
+ if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params
+ and 'filter_lost_elements' in bbox_params):
+ self.filter_lost_elements = True
+ self.origin_label_fields = bbox_params['label_fields']
+ bbox_params['label_fields'] = ['idx_mapper']
+ del bbox_params['filter_lost_elements']
+
+ self.bbox_params = (
+ self.albu_builder(bbox_params) if bbox_params else None)
+ self.aug = Compose([self.albu_builder(t) for t in self.transforms],
+ bbox_params=self.bbox_params)
+
+ if not keymap:
+ self.keymap_to_albu = {
+ 'img': 'image',
+ 'gt_masks': 'masks',
+ 'gt_bboxes': 'bboxes'
+ }
+ else:
+ self.keymap_to_albu = keymap
+ self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()}
+
+ def albu_builder(self, cfg):
+ """Import a module from albumentations.
+
+ It inherits some of :func:`build_from_cfg` logic.
+
+ Args:
+ cfg (dict): Config dict. It should at least contain the key "type".
+
+ Returns:
+ obj: The constructed object.
+ """
+
+ assert isinstance(cfg, dict) and 'type' in cfg
+ args = cfg.copy()
+
+ obj_type = args.pop('type')
+ if is_str(obj_type):
+ if albumentations is None:
+ raise RuntimeError('albumentations is not installed')
+ obj_cls = getattr(albumentations, obj_type)
+ elif inspect.isclass(obj_type):
+ obj_cls = obj_type
+ else:
+ raise TypeError(
+ f'type must be a str or valid type, but got {type(obj_type)}')
+
+ if 'transforms' in args:
+ args['transforms'] = [
+ self.albu_builder(transform)
+ for transform in args['transforms']
+ ]
+
+ return obj_cls(**args)
+
+ @staticmethod
+ def mapper(d, keymap):
+ """Dictionary mapper. Renames keys according to keymap provided.
+
+ Args:
+ d (dict): old dict
+ keymap (dict): {'old_key':'new_key'}
+ Returns:
+ dict: new dict.
+ """
+
+ updated_dict = {}
+ for k, v in zip(d.keys(), d.values()):
+ new_k = keymap.get(k, k)
+ updated_dict[new_k] = d[k]
+ return updated_dict
+
+ def __call__(self, results):
+ # dict to albumentations format
+ results = self.mapper(results, self.keymap_to_albu)
+ # TODO: add bbox_fields
+ if 'bboxes' in results:
+ # to list of boxes
+ if isinstance(results['bboxes'], np.ndarray):
+ results['bboxes'] = [x for x in results['bboxes']]
+ # add pseudo-field for filtration
+ if self.filter_lost_elements:
+ results['idx_mapper'] = np.arange(len(results['bboxes']))
+
+ # TODO: Support mask structure in albu
+ if 'masks' in results:
+ if isinstance(results['masks'], PolygonMasks):
+ raise NotImplementedError(
+ 'Albu only supports BitMap masks now')
+ ori_masks = results['masks']
+ if albumentations.__version__ < '0.5':
+ results['masks'] = results['masks'].masks
+ else:
+ results['masks'] = [mask for mask in results['masks'].masks]
+
+ results = self.aug(**results)
+
+ if 'bboxes' in results:
+ if isinstance(results['bboxes'], list):
+ results['bboxes'] = np.array(
+ results['bboxes'], dtype=np.float32)
+ results['bboxes'] = results['bboxes'].reshape(-1, 4)
+
+ # filter label_fields
+ if self.filter_lost_elements:
+
+ for label in self.origin_label_fields:
+ results[label] = np.array(
+ [results[label][i] for i in results['idx_mapper']])
+ if 'masks' in results:
+ results['masks'] = np.array(
+ [results['masks'][i] for i in results['idx_mapper']])
+ results['masks'] = ori_masks.__class__(
+ results['masks'], results['image'].shape[0],
+ results['image'].shape[1])
+
+ if (not len(results['idx_mapper'])
+ and self.skip_img_without_anno):
+ return None
+
+ if 'gt_labels' in results:
+ if isinstance(results['gt_labels'], list):
+ results['gt_labels'] = np.array(results['gt_labels'])
+ results['gt_labels'] = results['gt_labels'].astype(np.int64)
+
+ # back to the original format
+ results = self.mapper(results, self.keymap_back)
+
+ # update final shape
+ if self.update_pad_shape:
+ results['pad_shape'] = results['img'].shape
+
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__ + f'(transforms={self.transforms})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class RandomCenterCropPad:
+ """Random center crop and random around padding for CornerNet.
+
+    This operation generates a randomly cropped image from the original image
+    and pads it simultaneously. Different from :class:`RandomCrop`, the output
+    shape may not strictly equal ``crop_size``: a random value is chosen from
+    ``ratios``, so the output shape can be larger or smaller than ``crop_size``.
+    The padding operation also differs from :class:`Pad` in that it pads around
+    the image instead of only to the right and bottom.
+
+ The relation between output image (padding image) and original image:
+
+ .. code:: text
+
+ output image
+
+ +----------------------------+
+ | padded area |
+ +------|----------------------------|----------+
+ | | cropped area | |
+ | | +---------------+ | |
+ | | | . center | | | original image
+ | | | range | | |
+ | | +---------------+ | |
+ +------|----------------------------|----------+
+ | padded area |
+ +----------------------------+
+
+ There are 5 main areas in the figure:
+
+ - output image: output image of this operation, also called padding
+      image in the following instructions.
+ - original image: input image of this operation.
+ - padded area: non-intersect area of output image and original image.
+ - cropped area: the overlap of output image and original image.
+    - center range: a smaller area from which the random center is chosen.
+      The center range is computed from ``border`` and the original image's
+      shape so that the random center is not too close to the image border.
+
+    This operation also acts differently in train and test mode; the summary
+    pipeline is listed below.
+
+ Train pipeline:
+
+    1. Choose a ``random_ratio`` from ``ratios``; the shape of the padding
+       image will be ``random_ratio * crop_size``.
+    2. Choose a ``random_center`` in the center range.
+    3. Generate a padding image whose center matches the ``random_center``.
+    4. Initialize the padding image with pixel values equal to ``mean``.
+    5. Copy the cropped area to the padding image.
+    6. Refine annotations.
+
+    Test pipeline:
+
+    1. Compute the output shape according to ``test_pad_mode``.
+    2. Generate a padding image whose center matches the original image
+       center.
+    3. Initialize the padding image with pixel values equal to ``mean``.
+    4. Copy the ``cropped area`` to the padding image.
+
+ Args:
+        crop_size (tuple | None): expected size after crop; the final size is
+            computed according to ratio. Requires (h, w) in train mode, and
+ None in test mode.
+        ratios (tuple): randomly select a ratio from the tuple and crop the
+            image to (crop_size[0] * ratio) * (crop_size[1] * ratio).
+            Only available in train mode.
+        border (int): max distance from the center-selection area to the
+            image border. Only available in train mode.
+ mean (sequence): Mean values of 3 channels.
+ std (sequence): Std values of 3 channels.
+ to_rgb (bool): Whether to convert the image from BGR to RGB.
+        test_mode (bool): whether to involve random variables in the transform.
+            In train mode, crop_size is fixed, and the center coords and ratio
+            are randomly selected from predefined lists. In test mode,
+            crop_size is the image's original shape, and the center coords and
+            ratio are fixed.
+ test_pad_mode (tuple): padding method and padding shape value, only
+ available in test mode. Default is using 'logical_or' with
+ 127 as padding shape value.
+
+ - 'logical_or': final_shape = input_shape | padding_shape_value
+ - 'size_divisor': final_shape = int(
+ ceil(input_shape / padding_shape_value) * padding_shape_value)
+ test_pad_add_pix (int): Extra padding pixel in test mode. Default 0.
+ bbox_clip_border (bool, optional): Whether clip the objects outside
+ the border of the image. Defaults to True.
+ """
+
+ def __init__(self,
+ crop_size=None,
+ ratios=(0.9, 1.0, 1.1),
+ border=128,
+ mean=None,
+ std=None,
+ to_rgb=None,
+ test_mode=False,
+ test_pad_mode=('logical_or', 127),
+ test_pad_add_pix=0,
+ bbox_clip_border=True):
+ if test_mode:
+ assert crop_size is None, 'crop_size must be None in test mode'
+ assert ratios is None, 'ratios must be None in test mode'
+ assert border is None, 'border must be None in test mode'
+ assert isinstance(test_pad_mode, (list, tuple))
+ assert test_pad_mode[0] in ['logical_or', 'size_divisor']
+ else:
+ assert isinstance(crop_size, (list, tuple))
+ assert crop_size[0] > 0 and crop_size[1] > 0, (
+ 'crop_size must > 0 in train mode')
+ assert isinstance(ratios, (list, tuple))
+ assert test_pad_mode is None, (
+ 'test_pad_mode must be None in train mode')
+
+ self.crop_size = crop_size
+ self.ratios = ratios
+ self.border = border
+ # We do not set default value to mean, std and to_rgb because these
+ # hyper-parameters are easy to forget but could affect the performance.
+ # Please use the same setting as Normalize for performance assurance.
+ assert mean is not None and std is not None and to_rgb is not None
+ self.to_rgb = to_rgb
+ self.input_mean = mean
+ self.input_std = std
+ if to_rgb:
+ self.mean = mean[::-1]
+ self.std = std[::-1]
+ else:
+ self.mean = mean
+ self.std = std
+ self.test_mode = test_mode
+ self.test_pad_mode = test_pad_mode
+ self.test_pad_add_pix = test_pad_add_pix
+ self.bbox_clip_border = bbox_clip_border
+
+ def _get_border(self, border, size):
+ """Get final border for the target size.
+
+ This function generates a ``final_border`` according to image's shape.
+ The area between ``final_border`` and ``size - final_border`` is the
+        ``center range``. The center is randomly chosen from the ``center
+        range`` so that it is not too close to the original image's border.
+        Also, the ``center range`` should be larger than 0.
+
+ Args:
+ border (int): The initial border, default is 128.
+ size (int): The width or height of original image.
+ Returns:
+ int: The final border.
+ """
+ k = 2 * border / size
+ i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k)))
+ return border // i
+
+ def _filter_boxes(self, patch, boxes):
+ """Check whether the center of each box is in the patch.
+
+ Args:
+ patch (list[int]): The cropped area, [left, top, right, bottom].
+ boxes (numpy array, (N x 4)): Ground truth boxes.
+
+ Returns:
+ mask (numpy array, (N,)): Each box is inside or outside the patch.
+ """
+ center = (boxes[:, :2] + boxes[:, 2:]) / 2
+ mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (
+ center[:, 0] < patch[2]) * (
+ center[:, 1] < patch[3])
+ return mask
+
+ def _crop_image_and_paste(self, image, center, size):
+ """Crop image with a given center and size, then paste the cropped
+ image to a blank image with two centers align.
+
+ This function is equivalent to generating a blank image with ``size``
+ as its shape. Then cover it on the original image with two centers (
+ the center of blank image and the random center of original image)
+ aligned. The overlap area is paste from the original image and the
+ outside area is filled with ``mean pixel``.
+
+ Args:
+ image (np array, H x W x C): Original image.
+ center (list[int]): Target crop center coord.
+ size (list[int]): Target crop size. [target_h, target_w]
+
+ Returns:
+ cropped_img (np array, target_h x target_w x C): Cropped image.
+ border (np array, 4): The distance of four border of
+ ``cropped_img`` to the original image area, [top, bottom,
+ left, right]
+ patch (list[int]): The cropped area, [left, top, right, bottom].
+ """
+ center_y, center_x = center
+ target_h, target_w = size
+ img_h, img_w, img_c = image.shape
+
+ x0 = max(0, center_x - target_w // 2)
+ x1 = min(center_x + target_w // 2, img_w)
+ y0 = max(0, center_y - target_h // 2)
+ y1 = min(center_y + target_h // 2, img_h)
+ patch = np.array((int(x0), int(y0), int(x1), int(y1)))
+
+ left, right = center_x - x0, x1 - center_x
+ top, bottom = center_y - y0, y1 - center_y
+
+ cropped_center_y, cropped_center_x = target_h // 2, target_w // 2
+ cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype)
+ for i in range(img_c):
+ cropped_img[:, :, i] += self.mean[i]
+ y_slice = slice(cropped_center_y - top, cropped_center_y + bottom)
+ x_slice = slice(cropped_center_x - left, cropped_center_x + right)
+ cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :]
+
+ border = np.array([
+ cropped_center_y - top, cropped_center_y + bottom,
+ cropped_center_x - left, cropped_center_x + right
+ ],
+ dtype=np.float32)
+
+ return cropped_img, border, patch
+
+ def _train_aug(self, results):
+ """Random crop and around padding the original image.
+
+ Args:
+            results (dict): Image information in the augment pipeline.
+
+ Returns:
+ results (dict): The updated dict.
+ """
+ img = results['img']
+ h, w, c = img.shape
+ boxes = results['gt_bboxes']
+ while True:
+ scale = random.choice(self.ratios)
+ new_h = int(self.crop_size[0] * scale)
+ new_w = int(self.crop_size[1] * scale)
+ h_border = self._get_border(self.border, h)
+ w_border = self._get_border(self.border, w)
+
+ for i in range(50):
+ center_x = random.randint(low=w_border, high=w - w_border)
+ center_y = random.randint(low=h_border, high=h - h_border)
+
+ cropped_img, border, patch = self._crop_image_and_paste(
+ img, [center_y, center_x], [new_h, new_w])
+
+ mask = self._filter_boxes(patch, boxes)
+                # if the image has no valid bbox, any crop patch is valid.
+ if not mask.any() and len(boxes) > 0:
+ continue
+
+ results['img'] = cropped_img
+ results['img_shape'] = cropped_img.shape
+ results['pad_shape'] = cropped_img.shape
+
+ x0, y0, x1, y1 = patch
+
+ left_w, top_h = center_x - x0, center_y - y0
+ cropped_center_x, cropped_center_y = new_w // 2, new_h // 2
+
+ # crop bboxes accordingly and clip to the image boundary
+ for key in results.get('bbox_fields', []):
+ mask = self._filter_boxes(patch, results[key])
+ bboxes = results[key][mask]
+ bboxes[:, 0:4:2] += cropped_center_x - left_w - x0
+ bboxes[:, 1:4:2] += cropped_center_y - top_h - y0
+ if self.bbox_clip_border:
+ bboxes[:, 0:4:2] = np.clip(bboxes[:, 0:4:2], 0, new_w)
+ bboxes[:, 1:4:2] = np.clip(bboxes[:, 1:4:2], 0, new_h)
+ keep = (bboxes[:, 2] > bboxes[:, 0]) & (
+ bboxes[:, 3] > bboxes[:, 1])
+ bboxes = bboxes[keep]
+ results[key] = bboxes
+ if key in ['gt_bboxes']:
+ if 'gt_labels' in results:
+ labels = results['gt_labels'][mask]
+ labels = labels[keep]
+ results['gt_labels'] = labels
+ if 'gt_masks' in results:
+ raise NotImplementedError(
+ 'RandomCenterCropPad only supports bbox.')
+
+ # crop semantic seg
+ for key in results.get('seg_fields', []):
+ raise NotImplementedError(
+ 'RandomCenterCropPad only supports bbox.')
+ return results
+
+ def _test_aug(self, results):
+ """Around padding the original image without cropping.
+
+ The padding mode and value are from ``test_pad_mode``.
+
+ Args:
+            results (dict): Image information in the augment pipeline.
+
+ Returns:
+ results (dict): The updated dict.
+ """
+ img = results['img']
+ h, w, c = img.shape
+ results['img_shape'] = img.shape
+ if self.test_pad_mode[0] in ['logical_or']:
+ # self.test_pad_add_pix is only used for centernet
+ target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix
+ target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix
+ elif self.test_pad_mode[0] in ['size_divisor']:
+ divisor = self.test_pad_mode[1]
+ target_h = int(np.ceil(h / divisor)) * divisor
+ target_w = int(np.ceil(w / divisor)) * divisor
+ else:
+ raise NotImplementedError(
+                'RandomCenterCropPad only supports two testing pad modes: '
+                'logical_or and size_divisor.')
+
+ cropped_img, border, _ = self._crop_image_and_paste(
+ img, [h // 2, w // 2], [target_h, target_w])
+ results['img'] = cropped_img
+ results['pad_shape'] = cropped_img.shape
+ results['border'] = border
+ return results
+
+ def __call__(self, results):
+ img = results['img']
+ assert img.dtype == np.float32, (
+ 'RandomCenterCropPad needs the input image of dtype np.float32,'
+ ' please set "to_float32=True" in "LoadImageFromFile" pipeline')
+ h, w, c = img.shape
+ assert c == len(self.mean)
+ if self.test_mode:
+ return self._test_aug(results)
+ else:
+ return self._train_aug(results)
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(crop_size={self.crop_size}, '
+ repr_str += f'ratios={self.ratios}, '
+ repr_str += f'border={self.border}, '
+ repr_str += f'mean={self.input_mean}, '
+ repr_str += f'std={self.input_std}, '
+ repr_str += f'to_rgb={self.to_rgb}, '
+ repr_str += f'test_mode={self.test_mode}, '
+ repr_str += f'test_pad_mode={self.test_pad_mode}, '
+ repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+ return repr_str
+
+
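+# Illustrative sketch (editor's addition, not part of the original file): the
+# 'logical_or' output-shape rule that RandomCenterCropPad uses in test mode.
+# ORing with 127 rounds a side length up to the next value of the form
+# k * 128 - 1. The input sizes below are hypothetical.
+def _example_logical_or_pad_shape(h=375, w=1242, pad_value=127):
+    target_h = h | pad_value  # 375 -> 383
+    target_w = w | pad_value  # 1242 -> 1279
+    return target_h, target_w
+
+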
+@PIPELINES.register_module()
+class CutOut:
+ """CutOut operation.
+
+    Randomly drop some regions of the image, as proposed in
+    the ``Cutout`` paper.
+
+ Args:
+ n_holes (int | tuple[int, int]): Number of regions to be dropped.
+ If it is given as a list, number of holes will be randomly
+ selected from the closed interval [`n_holes[0]`, `n_holes[1]`].
+ cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate
+ shape of dropped regions. It can be `tuple[int, int]` to use a
+ fixed cutout shape, or `list[tuple[int, int]]` to randomly choose
+ shape from the list.
+ cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The
+ candidate ratio of dropped regions. It can be `tuple[float, float]`
+ to use a fixed ratio or `list[tuple[float, float]]` to randomly
+ choose ratio from the list. Please note that `cutout_shape`
+ and `cutout_ratio` cannot be both given at the same time.
+ fill_in (tuple[float, float, float] | tuple[int, int, int]): The value
+ of pixel to fill in the dropped regions. Default: (0, 0, 0).
+ """
+
+ def __init__(self,
+ n_holes,
+ cutout_shape=None,
+ cutout_ratio=None,
+ fill_in=(0, 0, 0)):
+
+ assert (cutout_shape is None) ^ (cutout_ratio is None), \
+ 'Either cutout_shape or cutout_ratio should be specified.'
+ assert (isinstance(cutout_shape, (list, tuple))
+ or isinstance(cutout_ratio, (list, tuple)))
+ if isinstance(n_holes, tuple):
+ assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
+ else:
+ n_holes = (n_holes, n_holes)
+ self.n_holes = n_holes
+ self.fill_in = fill_in
+ self.with_ratio = cutout_ratio is not None
+ self.candidates = cutout_ratio if self.with_ratio else cutout_shape
+ if not isinstance(self.candidates, list):
+ self.candidates = [self.candidates]
+
+ def __call__(self, results):
+ """Call function to drop some regions of image."""
+ h, w, c = results['img'].shape
+ n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
+ for _ in range(n_holes):
+ x1 = np.random.randint(0, w)
+ y1 = np.random.randint(0, h)
+ index = np.random.randint(0, len(self.candidates))
+ if not self.with_ratio:
+ cutout_w, cutout_h = self.candidates[index]
+ else:
+ cutout_w = int(self.candidates[index][0] * w)
+ cutout_h = int(self.candidates[index][1] * h)
+
+ x2 = np.clip(x1 + cutout_w, 0, w)
+ y2 = np.clip(y1 + cutout_h, 0, h)
+ results['img'][y1:y2, x1:x2, :] = self.fill_in
+
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(n_holes={self.n_holes}, '
+ repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
+ else f'cutout_shape={self.candidates}, ')
+ repr_str += f'fill_in={self.fill_in})'
+ return repr_str
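+
+
+# Illustrative usage sketch (editor's addition, not part of the original
+# file): cut two fixed-size holes out of a dummy image.
+def _example_cutout():
+    results = dict(img=np.zeros((64, 64, 3), dtype=np.uint8))
+    transform = CutOut(n_holes=2, cutout_shape=(8, 8))
+    return transform(results)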
diff --git a/mmcv/datasets/pipelines/transforms_3d.py b/mmcv/datasets/pipelines/transforms_3d.py
new file mode 100644
index 0000000..aa7ab25
--- /dev/null
+++ b/mmcv/datasets/pipelines/transforms_3d.py
@@ -0,0 +1,2042 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from numpy import random
+import warnings
+from mmcv import is_tuple_of
+from mmcv.utils import build_from_cfg
+from mmcv.parallel import DataContainer as DC
+
+from mmcv.core.voxel.voxel_generator import VoxelGenerator
+from mmcv.core.bbox.structures.cam_box3d import CameraInstance3DBoxes
+from mmcv.core.bbox.structures.depth_box3d import DepthInstance3DBoxes
+from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from mmcv.core.bbox import box_np_ops
+from mmcv.datasets.builder import PIPELINES
+from mmcv.datasets.pipelines.transforms import RandomFlip
+from mmcv.image import impad, impad_to_multiple, imnormalize, imresize, bgr2hsv, hsv2bgr
+from ..builder import OBJECTSAMPLERS
+from .data_augment_utils import noise_per_object_v3_
+
+
+@PIPELINES.register_module()
+class RandomDropPointsColor(object):
+ r"""Randomly set the color of points to all zeros.
+
+    With probability ``drop_ratio``, all the points' colors are set to zero.
+    Refer to the PAConv paper for more details.
+
+ Args:
+ drop_ratio (float): The probability of dropping point colors.
+ Defaults to 0.2.
+ """
+
+ def __init__(self, drop_ratio=0.2):
+ assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \
+ f'invalid drop_ratio value {drop_ratio}'
+ self.drop_ratio = drop_ratio
+
+ def __call__(self, input_dict):
+ """Call function to drop point colors.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after color dropping, \
+ 'points' key is updated in the result dict.
+ """
+ points = input_dict['points']
+ assert points.attribute_dims is not None and \
+ 'color' in points.attribute_dims, \
+ 'Expect points have color attribute'
+
+ # this if-expression is a bit strange
+ # `RandomDropPointsColor` is used in training 3D segmentor PAConv
+ # we discovered in our experiments that, using
+ # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to
+ # better results than using `if np.random.rand() < self.drop_ratio`
+ # so we keep this hack in our codebase
+ if np.random.rand() > 1.0 - self.drop_ratio:
+ points.color = points.color * 0.0
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(drop_ratio={self.drop_ratio})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class RandomFlip3D(RandomFlip):
+ """Flip the points & bbox.
+
+ If the input dict contains the key "flip", then the flag will be used,
+ otherwise it will be randomly decided by a ratio specified in the init
+ method.
+
+ Args:
+ sync_2d (bool, optional): Whether to apply flip according to the 2D
+ images. If True, it will apply the same flip as that to 2D images.
+            If False, it will decide whether to flip randomly and
+            independently of the 2D images. Defaults to True.
+ flip_ratio_bev_horizontal (float, optional): The flipping probability
+ in horizontal direction. Defaults to 0.0.
+ flip_ratio_bev_vertical (float, optional): The flipping probability
+ in vertical direction. Defaults to 0.0.
+ """
+
+ def __init__(self,
+ sync_2d=True,
+ flip_ratio_bev_horizontal=0.0,
+ flip_ratio_bev_vertical=0.0,
+ **kwargs):
+ super(RandomFlip3D, self).__init__(
+ flip_ratio=flip_ratio_bev_horizontal, **kwargs)
+ self.sync_2d = sync_2d
+ self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
+ if flip_ratio_bev_horizontal is not None:
+ assert isinstance(
+ flip_ratio_bev_horizontal,
+ (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1
+ if flip_ratio_bev_vertical is not None:
+ assert isinstance(
+ flip_ratio_bev_vertical,
+ (int, float)) and 0 <= flip_ratio_bev_vertical <= 1
+
+ def random_flip_data_3d(self, input_dict, direction='horizontal'):
+ """Flip 3D data randomly.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+ direction (str): Flip direction. Default: horizontal.
+
+ Returns:
+ dict: Flipped results, 'points', 'bbox3d_fields' keys are \
+ updated in the result dict.
+ """
+ assert direction in ['horizontal', 'vertical']
+ if len(input_dict['bbox3d_fields']) == 0: # test mode
+ input_dict['bbox3d_fields'].append('empty_box3d')
+ input_dict['empty_box3d'] = input_dict['box_type_3d'](
+ np.array([], dtype=np.float32))
+ assert len(input_dict['bbox3d_fields']) == 1
+ for key in input_dict['bbox3d_fields']:
+ if 'points' in input_dict:
+ input_dict['points'] = input_dict[key].flip(
+ direction, points=input_dict['points'])
+ else:
+ input_dict[key].flip(direction)
+ if 'centers2d' in input_dict:
+ assert self.sync_2d is True and direction == 'horizontal', \
+ 'Only support sync_2d=True and horizontal flip with images'
+ w = input_dict['ori_shape'][1]
+ input_dict['centers2d'][..., 0] = \
+ w - input_dict['centers2d'][..., 0]
+ # need to modify the horizontal position of camera center
+ # along u-axis in the image (flip like centers2d)
+ # ['cam2img'][0][2] = c_u
+ # see more details and examples at
+            # https://github.com/open-mmlab/mmdetection3d/pull/744
+ input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2]
+
+ def __call__(self, input_dict):
+ """Call function to flip points, values in the ``bbox3d_fields`` and \
+ also flip 2D image and its annotations.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Flipped results, 'flip', 'flip_direction', \
+ 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added \
+ into result dict.
+ """
+        # flip 2D image and its annotations
+ super(RandomFlip3D, self).__call__(input_dict)
+
+ if self.sync_2d:
+ input_dict['pcd_horizontal_flip'] = input_dict['flip']
+ input_dict['pcd_vertical_flip'] = False
+ else:
+ if 'pcd_horizontal_flip' not in input_dict:
+ flip_horizontal = True if np.random.rand(
+ ) < self.flip_ratio else False
+ input_dict['pcd_horizontal_flip'] = flip_horizontal
+ if 'pcd_vertical_flip' not in input_dict:
+ flip_vertical = True if np.random.rand(
+ ) < self.flip_ratio_bev_vertical else False
+ input_dict['pcd_vertical_flip'] = flip_vertical
+
+ if 'transformation_3d_flow' not in input_dict:
+ input_dict['transformation_3d_flow'] = []
+
+ if input_dict['pcd_horizontal_flip']:
+ self.random_flip_data_3d(input_dict, 'horizontal')
+ input_dict['transformation_3d_flow'].extend(['HF'])
+ if input_dict['pcd_vertical_flip']:
+ self.random_flip_data_3d(input_dict, 'vertical')
+ input_dict['transformation_3d_flow'].extend(['VF'])
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(sync_2d={self.sync_2d},'
+ repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})'
+ return repr_str
+
+
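+# Illustrative pipeline-config sketch (editor's addition, not part of the
+# original file): how RandomFlip3D is typically configured in a training
+# pipeline. The probabilities are placeholders, not values from this
+# repository's configs.
+_EXAMPLE_RANDOM_FLIP_3D_CFG = dict(
+    type='RandomFlip3D',
+    sync_2d=True,
+    flip_ratio_bev_horizontal=0.5,
+    flip_ratio_bev_vertical=0.0)
+
+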
+@PIPELINES.register_module()
+class RandomJitterPoints(object):
+ """Randomly jitter point coordinates.
+
+ Different from the global translation in ``GlobalRotScaleTrans``, here we \
+ apply different noises to each point in a scene.
+
+ Args:
+ jitter_std (list[float]): The standard deviation of jittering noise.
+ This applies random noise to all points in a 3D scene, which is \
+ sampled from a gaussian distribution whose standard deviation is \
+ set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01]
+ clip_range (list[float] | None): Clip the randomly generated jitter \
+ noise into this range. If None is given, don't perform clipping.
+ Defaults to [-0.05, 0.05]
+
+ Note:
+ This transform should only be used in point cloud segmentation tasks \
+ because we don't transform ground-truth bboxes accordingly.
+ For similar transform in detection task, please refer to `ObjectNoise`.
+ """
+
+ def __init__(self,
+ jitter_std=[0.01, 0.01, 0.01],
+ clip_range=[-0.05, 0.05]):
+ seq_types = (list, tuple, np.ndarray)
+ if not isinstance(jitter_std, seq_types):
+ assert isinstance(jitter_std, (int, float)), \
+ f'unsupported jitter_std type {type(jitter_std)}'
+ jitter_std = [jitter_std, jitter_std, jitter_std]
+ self.jitter_std = jitter_std
+
+ if clip_range is not None:
+ if not isinstance(clip_range, seq_types):
+ assert isinstance(clip_range, (int, float)), \
+ f'unsupported clip_range type {type(clip_range)}'
+ clip_range = [-clip_range, clip_range]
+ self.clip_range = clip_range
+
+ def __call__(self, input_dict):
+ """Call function to jitter all the points in the scene.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after adding noise to each point, \
+ 'points' key is updated in the result dict.
+ """
+ points = input_dict['points']
+ jitter_std = np.array(self.jitter_std, dtype=np.float32)
+ jitter_noise = \
+ np.random.randn(points.shape[0], 3) * jitter_std[None, :]
+ if self.clip_range is not None:
+ jitter_noise = np.clip(jitter_noise, self.clip_range[0],
+ self.clip_range[1])
+
+ points.translate(jitter_noise)
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(jitter_std={self.jitter_std},'
+ repr_str += f' clip_range={self.clip_range})'
+ return repr_str
+
+
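+# Illustrative sketch (editor's addition, not part of the original file): the
+# noise-generation step of RandomJitterPoints in plain numpy, using the
+# default std and clip range documented above.
+def _example_jitter_noise(num_points=4):
+    jitter_std = np.array([0.01, 0.01, 0.01], dtype=np.float32)
+    noise = np.random.randn(num_points, 3) * jitter_std[None, :]
+    return np.clip(noise, -0.05, 0.05)
+
+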
+@PIPELINES.register_module()
+class ObjectSample(object):
+ """Sample GT objects to the data.
+
+ Args:
+ db_sampler (dict): Config dict of the database sampler.
+        sample_2d (bool): Whether to also paste the 2D image patch to the images.
+ This should be true when applying multi-modality cut-and-paste.
+ Defaults to False.
+ """
+
+ def __init__(self, db_sampler, sample_2d=False):
+ self.sampler_cfg = db_sampler
+ self.sample_2d = sample_2d
+ if 'type' not in db_sampler.keys():
+ db_sampler['type'] = 'DataBaseSampler'
+ self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS)
+
+ @staticmethod
+ def remove_points_in_boxes(points, boxes):
+ """Remove the points in the sampled bounding boxes.
+
+ Args:
+ points (:obj:`BasePoints`): Input point cloud array.
+ boxes (np.ndarray): Sampled ground truth boxes.
+
+ Returns:
+ np.ndarray: Points with those in the boxes removed.
+ """
+ masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes)
+ points = points[np.logical_not(masks.any(-1))]
+ return points
+
+ def __call__(self, input_dict):
+ """Call function to sample ground truth objects to the data.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after object sampling augmentation, \
+ 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated \
+ in the result dict.
+ """
+ gt_bboxes_3d = input_dict['gt_bboxes_3d']
+ gt_labels_3d = input_dict['gt_labels_3d']
+
+ # change to float for blending operation
+ points = input_dict['points']
+ if self.sample_2d:
+ img = input_dict['img']
+ gt_bboxes_2d = input_dict['gt_bboxes']
+ # Assume for now 3D & 2D bboxes are the same
+ sampled_dict = self.db_sampler.sample_all(
+ gt_bboxes_3d.tensor.numpy(),
+ gt_labels_3d,
+ gt_bboxes_2d=gt_bboxes_2d,
+ img=img)
+ else:
+ sampled_dict = self.db_sampler.sample_all(
+ gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None)
+
+ if sampled_dict is not None:
+ sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d']
+ sampled_points = sampled_dict['points']
+ sampled_gt_labels = sampled_dict['gt_labels_3d']
+
+ gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels],
+ axis=0)
+ gt_bboxes_3d = gt_bboxes_3d.new_box(
+ np.concatenate(
+ [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d]))
+
+ points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d)
+ # check the points dimension
+ points = points.cat([sampled_points, points])
+
+ if self.sample_2d:
+ sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d']
+ gt_bboxes_2d = np.concatenate(
+ [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32)
+
+ input_dict['gt_bboxes'] = gt_bboxes_2d
+ input_dict['img'] = sampled_dict['img']
+
+ input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+        # use np.int64 explicitly; the np.long alias is removed in recent numpy
+        input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64)
+ input_dict['points'] = points
+
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f' sample_2d={self.sample_2d},'
+ repr_str += f' data_root={self.sampler_cfg.data_root},'
+ repr_str += f' info_path={self.sampler_cfg.info_path},'
+ repr_str += f' rate={self.sampler_cfg.rate},'
+ repr_str += f' prepare={self.sampler_cfg.prepare},'
+ repr_str += f' classes={self.sampler_cfg.classes},'
+ repr_str += f' sample_groups={self.sampler_cfg.sample_groups}'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class ObjectNoise(object):
+ """Apply noise to each GT objects in the scene.
+
+ Args:
+ translation_std (list[float], optional): Standard deviation of the
+ distribution where translation noise are sampled from.
+ Defaults to [0.25, 0.25, 0.25].
+ global_rot_range (list[float], optional): Global rotation to the scene.
+ Defaults to [0.0, 0.0].
+ rot_range (list[float], optional): Object rotation range.
+ Defaults to [-0.15707963267, 0.15707963267].
+ num_try (int, optional): Number of times to try if the noise applied is
+ invalid. Defaults to 100.
+ """
+
+ def __init__(self,
+ translation_std=[0.25, 0.25, 0.25],
+ global_rot_range=[0.0, 0.0],
+ rot_range=[-0.15707963267, 0.15707963267],
+ num_try=100):
+ self.translation_std = translation_std
+ self.global_rot_range = global_rot_range
+ self.rot_range = rot_range
+ self.num_try = num_try
+
+ def __call__(self, input_dict):
+ """Call function to apply noise to each ground truth in the scene.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after adding noise to each object, \
+ 'points', 'gt_bboxes_3d' keys are updated in the result dict.
+ """
+ gt_bboxes_3d = input_dict['gt_bboxes_3d']
+ points = input_dict['points']
+
+ # TODO: check this inplace function
+ numpy_box = gt_bboxes_3d.tensor.numpy()
+ numpy_points = points.tensor.numpy()
+
+ noise_per_object_v3_(
+ numpy_box,
+ numpy_points,
+ rotation_perturb=self.rot_range,
+ center_noise_std=self.translation_std,
+ global_random_rot_range=self.global_rot_range,
+ num_try=self.num_try)
+
+ input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box)
+ input_dict['points'] = points.new_point(numpy_points)
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(num_try={self.num_try},'
+ repr_str += f' translation_std={self.translation_std},'
+ repr_str += f' global_rot_range={self.global_rot_range},'
+ repr_str += f' rot_range={self.rot_range})'
+ return repr_str
+
+
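+# Illustrative pipeline-config sketch (editor's addition, not part of the
+# original file): an ObjectNoise entry using the documented default ranges.
+_EXAMPLE_OBJECT_NOISE_CFG = dict(
+    type='ObjectNoise',
+    num_try=100,
+    translation_std=[0.25, 0.25, 0.25],
+    global_rot_range=[0.0, 0.0],
+    rot_range=[-0.15707963267, 0.15707963267])
+
+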
+@PIPELINES.register_module()
+class GlobalAlignment(object):
+ """Apply global alignment to 3D scene points by rotation and translation.
+
+ Args:
+ rotation_axis (int): Rotation axis for points and bboxes rotation.
+
+ Note:
+        We do not record the applied rotation and translation as in \
+        GlobalRotScaleTrans, because we usually do not need to reverse \
+        the alignment step.
+        For example, the ScanNet 3D detection task uses aligned ground-truth \
+        bounding boxes for evaluation.
+ """
+
+ def __init__(self, rotation_axis):
+ self.rotation_axis = rotation_axis
+
+ def _trans_points(self, input_dict, trans_factor):
+ """Private function to translate points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+ trans_factor (np.ndarray): Translation vector to be applied.
+
+ Returns:
+ dict: Results after translation, 'points' is updated in the dict.
+ """
+ input_dict['points'].translate(trans_factor)
+
+ def _rot_points(self, input_dict, rot_mat):
+ """Private function to rotate bounding boxes and points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+ rot_mat (np.ndarray): Rotation matrix to be applied.
+
+ Returns:
+ dict: Results after rotation, 'points' is updated in the dict.
+ """
+ # input should be rot_mat_T so I transpose it here
+ input_dict['points'].rotate(rot_mat.T)
+
+ def _check_rot_mat(self, rot_mat):
+ """Check if rotation matrix is valid for self.rotation_axis.
+
+ Args:
+ rot_mat (np.ndarray): Rotation matrix to be applied.
+ """
+ is_valid = np.allclose(np.linalg.det(rot_mat), 1.0)
+ valid_array = np.zeros(3)
+ valid_array[self.rotation_axis] = 1.0
+ is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all()
+ is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all()
+ assert is_valid, f'invalid rotation matrix {rot_mat}'
+
+ def __call__(self, input_dict):
+ """Call function to shuffle points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after global alignment, 'points' and keys in \
+ input_dict['bbox3d_fields'] are updated in the result dict.
+ """
+ assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \
+ 'axis_align_matrix is not provided in GlobalAlignment'
+
+ axis_align_matrix = input_dict['ann_info']['axis_align_matrix']
+ assert axis_align_matrix.shape == (4, 4), \
+ f'invalid shape {axis_align_matrix.shape} for axis_align_matrix'
+ rot_mat = axis_align_matrix[:3, :3]
+ trans_vec = axis_align_matrix[:3, -1]
+
+ self._check_rot_mat(rot_mat)
+ self._rot_points(input_dict, rot_mat)
+ self._trans_points(input_dict, trans_vec)
+
+ return input_dict
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(rotation_axis={self.rotation_axis})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class GlobalRotScaleTrans(object):
+ """Apply global rotation, scaling and translation to a 3D scene.
+
+ Args:
+ rot_range (list[float]): Range of rotation angle.
+ Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
+ scale_ratio_range (list[float]): Range of scale ratio.
+ Defaults to [0.95, 1.05].
+ translation_std (list[float]): The standard deviation of translation
+ noise. This applies random translation to a scene by a noise, which
+ is sampled from a gaussian distribution whose standard deviation
+ is set by ``translation_std``. Defaults to [0, 0, 0]
+ shift_height (bool): Whether to shift height
+ (the fourth dimension of indoor points) when scaling.
+ Defaults to False.
+ """
+
+ def __init__(self,
+ rot_range=[-0.78539816, 0.78539816],
+ scale_ratio_range=[0.95, 1.05],
+ translation_std=[0, 0, 0],
+ shift_height=False):
+ seq_types = (list, tuple, np.ndarray)
+ if not isinstance(rot_range, seq_types):
+ assert isinstance(rot_range, (int, float)), \
+ f'unsupported rot_range type {type(rot_range)}'
+ rot_range = [-rot_range, rot_range]
+ self.rot_range = rot_range
+
+ assert isinstance(scale_ratio_range, seq_types), \
+ f'unsupported scale_ratio_range type {type(scale_ratio_range)}'
+ self.scale_ratio_range = scale_ratio_range
+
+ if not isinstance(translation_std, seq_types):
+ assert isinstance(translation_std, (int, float)), \
+ f'unsupported translation_std type {type(translation_std)}'
+ translation_std = [
+ translation_std, translation_std, translation_std
+ ]
+ assert all([std >= 0 for std in translation_std]), \
+ 'translation_std should be non-negative'
+ self.translation_std = translation_std
+ self.shift_height = shift_height
+
+ def _trans_bbox_points(self, input_dict):
+ """Private function to translate bounding boxes and points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after translation, 'points', 'pcd_trans' \
+ and keys in input_dict['bbox3d_fields'] are updated \
+ in the result dict.
+ """
+ translation_std = np.array(self.translation_std, dtype=np.float32)
+ trans_factor = np.random.normal(scale=translation_std, size=3).T
+
+ input_dict['points'].translate(trans_factor)
+ input_dict['pcd_trans'] = trans_factor
+ for key in input_dict['bbox3d_fields']:
+ input_dict[key].translate(trans_factor)
+
+ def _rot_bbox_points(self, input_dict):
+ """Private function to rotate bounding boxes and points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after rotation, 'points', 'pcd_rotation' \
+ and keys in input_dict['bbox3d_fields'] are updated \
+ in the result dict.
+ """
+ rotation = self.rot_range
+ noise_rotation = np.random.uniform(rotation[0], rotation[1])
+
+ # if no bbox in input_dict, only rotate points
+ if len(input_dict['bbox3d_fields']) == 0:
+ rot_mat_T = input_dict['points'].rotate(noise_rotation)
+ input_dict['pcd_rotation'] = rot_mat_T
+ return
+
+ # rotate points with bboxes
+ for key in input_dict['bbox3d_fields']:
+ if len(input_dict[key].tensor) != 0:
+ points, rot_mat_T = input_dict[key].rotate(
+ noise_rotation, input_dict['points'])
+ input_dict['points'] = points
+ input_dict['pcd_rotation'] = rot_mat_T
+
+ def _scale_bbox_points(self, input_dict):
+ """Private function to scale bounding boxes and points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after scaling, 'points' and keys in \
+ input_dict['bbox3d_fields'] are updated in the result dict.
+ """
+ scale = input_dict['pcd_scale_factor']
+ points = input_dict['points']
+ points.scale(scale)
+ if self.shift_height:
+ assert 'height' in points.attribute_dims.keys(), \
+ 'setting shift_height=True but points have no height attribute'
+ points.tensor[:, points.attribute_dims['height']] *= scale
+ input_dict['points'] = points
+
+ for key in input_dict['bbox3d_fields']:
+ input_dict[key].scale(scale)
+
+ def _random_scale(self, input_dict):
+ """Private function to randomly set the scale factor.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after scaling, 'pcd_scale_factor' are updated \
+ in the result dict.
+ """
+ scale_factor = np.random.uniform(self.scale_ratio_range[0],
+ self.scale_ratio_range[1])
+ input_dict['pcd_scale_factor'] = scale_factor
+
+ def __call__(self, input_dict):
+ """Private function to rotate, scale and translate bounding boxes and \
+ points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after scaling, 'points', 'pcd_rotation',
+ 'pcd_scale_factor', 'pcd_trans' and keys in \
+ input_dict['bbox3d_fields'] are updated in the result dict.
+ """
+ if 'transformation_3d_flow' not in input_dict:
+ input_dict['transformation_3d_flow'] = []
+
+ self._rot_bbox_points(input_dict)
+
+ if 'pcd_scale_factor' not in input_dict:
+ self._random_scale(input_dict)
+ self._scale_bbox_points(input_dict)
+
+ self._trans_bbox_points(input_dict)
+
+ input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(rot_range={self.rot_range},'
+ repr_str += f' scale_ratio_range={self.scale_ratio_range},'
+ repr_str += f' translation_std={self.translation_std},'
+ repr_str += f' shift_height={self.shift_height})'
+ return repr_str
+
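+# Illustrative usage (an assumption, not taken from this repo's configs): since the
+# class is registered in PIPELINES, it is normally built from a config dict such as
+#   dict(type='GlobalRotScaleTrans',
+#        rot_range=[-0.3925, 0.3925],
+#        scale_ratio_range=[0.95, 1.05],
+#        translation_std=[0, 0, 0])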
+
+@PIPELINES.register_module()
+class PointShuffle(object):
+ """Shuffle input points."""
+
+ def __call__(self, input_dict):
+ """Call function to shuffle points.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ idx = input_dict['points'].shuffle()
+ idx = idx.numpy()
+
+ pts_instance_mask = input_dict.get('pts_instance_mask', None)
+ pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+
+ if pts_instance_mask is not None:
+ input_dict['pts_instance_mask'] = pts_instance_mask[idx]
+
+ if pts_semantic_mask is not None:
+ input_dict['pts_semantic_mask'] = pts_semantic_mask[idx]
+
+ return input_dict
+
+ def __repr__(self):
+ return self.__class__.__name__
+
+
+@PIPELINES.register_module()
+class ObjectRangeFilter(object):
+ """Filter objects by the range.
+
+ Args:
+ point_cloud_range (list[float]): Point cloud range.
+ """
+
+ def __init__(self, point_cloud_range):
+ self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+ def __call__(self, input_dict):
+ """Call function to filter objects by the range.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
+ keys are updated in the result dict.
+ """
+ # Check points instance type and initialise bev_range
+ if isinstance(input_dict['gt_bboxes_3d'],
+ (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+ bev_range = self.pcd_range[[0, 1, 3, 4]]
+ elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes):
+ bev_range = self.pcd_range[[0, 2, 3, 5]]
+
+ gt_bboxes_3d = input_dict['gt_bboxes_3d']
+ gt_labels_3d = input_dict['gt_labels_3d']
+ mask = gt_bboxes_3d.in_range_bev(bev_range)
+ gt_bboxes_3d = gt_bboxes_3d[mask]
+ # mask is a torch tensor but gt_labels_3d is still numpy array
+ # using mask to index gt_labels_3d will cause bug when
+ # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+ # as gt_labels_3d[1] and cause out of index error
+ gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
+
+ # limit rad to [-pi, pi]
+ gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+ input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+ input_dict['gt_labels_3d'] = gt_labels_3d
+
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class PointsRangeFilter(object):
+ """Filter points by the range.
+
+ Args:
+ point_cloud_range (list[float]): Point cloud range.
+ """
+
+ def __init__(self, point_cloud_range):
+ self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+ def __call__(self, input_dict):
+ """Call function to filter points by the range.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ points = input_dict['points']
+ points_mask = points.in_range_3d(self.pcd_range)
+ clean_points = points[points_mask]
+ input_dict['points'] = clean_points
+ points_mask = points_mask.numpy()
+
+ pts_instance_mask = input_dict.get('pts_instance_mask', None)
+ pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+
+ if pts_instance_mask is not None:
+ input_dict['pts_instance_mask'] = pts_instance_mask[points_mask]
+
+ if pts_semantic_mask is not None:
+ input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask]
+
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class ObjectNameFilter(object):
+ """Filter GT objects by their names.
+
+ Args:
+ classes (list[str]): List of class names to be kept for training.
+ """
+
+ def __init__(self, classes):
+ self.classes = classes
+ self.labels = list(range(len(self.classes)))
+
+ def __call__(self, input_dict):
+ """Call function to filter objects by their names.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
+ keys are updated in the result dict.
+ """
+ gt_labels_3d = input_dict['gt_labels_3d']
+ gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d],
+ dtype=np.bool_)
+ input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask]
+ input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask]
+
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(classes={self.classes})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class PointSample(object):
+ """Point sample.
+
+ Sampling data to a certain number.
+
+ Args:
+ num_points (int): Number of points to be sampled.
+ sample_range (float, optional): The range within which to sample points.
+ If not None, points with depth larger than `sample_range` are
+ preferentially kept during sampling. Defaults to None.
+ replace (bool, optional): Whether the sampling is with or without
+ replacement. Defaults to False.
+ """
+
+ def __init__(self, num_points, sample_range=None, replace=False):
+ self.num_points = num_points
+ self.sample_range = sample_range
+ self.replace = replace
+
+ def _points_random_sampling(self,
+ points,
+ num_samples,
+ sample_range=None,
+ replace=False,
+ return_choices=False):
+ """Points random sampling.
+
+ Sample points to a certain number.
+
+ Args:
+ points (np.ndarray | :obj:`BasePoints`): 3D Points.
+ num_samples (int): Number of samples to be sampled.
+ sample_range (float, optional): Indicating the range where the
+ points will be sampled. Defaults to None.
+ replace (bool, optional): Sampling with or without replacement.
+ Defaults to False.
+ return_choices (bool, optional): Whether return choice.
+ Defaults to False.
+ Returns:
+ tuple[np.ndarray] | np.ndarray:
+ - points (np.ndarray | :obj:`BasePoints`): 3D Points.
+ - choices (np.ndarray, optional): The generated random samples.
+ """
+ if not replace:
+ replace = (points.shape[0] < num_samples)
+ point_range = range(len(points))
+ if sample_range is not None and not replace:
+ # Only sampling the near points when len(points) >= num_samples
+ depth = np.linalg.norm(points.tensor, axis=1)
+ far_inds = np.where(depth > sample_range)[0]
+ near_inds = np.where(depth <= sample_range)[0]
+ # in case there are too many far points
+ if len(far_inds) > num_samples:
+ far_inds = np.random.choice(
+ far_inds, num_samples, replace=False)
+ point_range = near_inds
+ num_samples -= len(far_inds)
+ choices = np.random.choice(point_range, num_samples, replace=replace)
+ if sample_range is not None and not replace:
+ choices = np.concatenate((far_inds, choices))
+ # Shuffle points after sampling
+ np.random.shuffle(choices)
+ if return_choices:
+ return points[choices], choices
+ else:
+ return points[choices]
+
+ def __call__(self, results):
+ """Call function to sample points to in indoor scenes.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Results after sampling, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ points = results['points']
+ # Points in Camera coord can provide the depth information.
+ # TODO: Need to support distance-based sampling for other coordinate systems.
+ if self.sample_range is not None:
+ from mmcv.core.points import CameraPoints
+ assert isinstance(points, CameraPoints), \
+ 'Sampling based on distance is only appliable for CAMERA coord'
+ points, choices = self._points_random_sampling(
+ points,
+ self.num_points,
+ self.sample_range,
+ self.replace,
+ return_choices=True)
+ results['points'] = points
+
+ pts_instance_mask = results.get('pts_instance_mask', None)
+ pts_semantic_mask = results.get('pts_semantic_mask', None)
+
+ if pts_instance_mask is not None:
+ pts_instance_mask = pts_instance_mask[choices]
+ results['pts_instance_mask'] = pts_instance_mask
+
+ if pts_semantic_mask is not None:
+ pts_semantic_mask = pts_semantic_mask[choices]
+ results['pts_semantic_mask'] = pts_semantic_mask
+
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(num_points={self.num_points},'
+ repr_str += f' sample_range={self.sample_range},'
+ repr_str += f' replace={self.replace})'
+
+ return repr_str
+
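+# Behavioural note (illustrative, derived from _points_random_sampling above): with a
+# hypothetical dict(type='PointSample', num_points=65536, sample_range=40.0), points
+# farther than 40 m are kept first (capped at num_points) and the remaining budget is
+# drawn from the near points; per the assert in __call__, distance-based sampling is
+# only applicable to CameraPoints.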
+
+@PIPELINES.register_module()
+class IndoorPointSample(PointSample):
+ """Indoor point sample.
+
+ Sampling data to a certain number.
+ NOTE: IndoorPointSample is deprecated in favor of PointSample
+
+ Args:
+ num_points (int): Number of points to be sampled.
+ """
+
+ def __init__(self, *args, **kwargs):
+ warnings.warn(
+ 'IndoorPointSample is deprecated in favor of PointSample')
+ super(IndoorPointSample, self).__init__(*args, **kwargs)
+
+
+@PIPELINES.register_module()
+class IndoorPatchPointSample(object):
+ r"""Indoor point sample within a patch. Modified from `PointNet++ `_.
+
+ Sampling data to a certain number for semantic segmentation.
+
+ Args:
+ num_points (int): Number of points to be sampled.
+ block_size (float, optional): Size of a block to sample points from.
+ Defaults to 1.5.
+ sample_rate (float, optional): Stride used in sliding patch generation.
+ This parameter is unused in `IndoorPatchPointSample` and thus has
+ been deprecated. We plan to remove it in the future.
+ Defaults to None.
+ ignore_index (int, optional): Label index that won't be used for the
+ segmentation task. This is set in PointSegClassMapping as neg_cls.
+ If not None, will be used as a patch selection criterion.
+ Defaults to None.
+ use_normalized_coord (bool, optional): Whether to use normalized xyz as
+ additional features. Defaults to False.
+ num_try (int, optional): Number of times to try if the patch selected
+ is invalid. Defaults to 10.
+ enlarge_size (float | None, optional): Enlarge the sampled patch to
+ [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as
+ an augmentation. If None, set it as 0. Defaults to 0.2.
+ min_unique_num (int | None, optional): Minimum number of unique points
+ the sampled patch should contain. If None, use PointNet++'s method
+ to judge uniqueness. Defaults to None.
+ eps (float, optional): A value added to patch boundary to guarantee
+ points coverage. Defaults to 1e-2.
+
+ Note:
+ This transform should only be used in the training process of point
+ cloud segmentation tasks. For the sliding patch generation and
+ inference process in testing, please refer to the `slide_inference`
+ function of `EncoderDecoder3D` class.
+ """
+
+ def __init__(self,
+ num_points,
+ block_size=1.5,
+ sample_rate=None,
+ ignore_index=None,
+ use_normalized_coord=False,
+ num_try=10,
+ enlarge_size=0.2,
+ min_unique_num=None,
+ eps=1e-2):
+ self.num_points = num_points
+ self.block_size = block_size
+ self.ignore_index = ignore_index
+ self.use_normalized_coord = use_normalized_coord
+ self.num_try = num_try
+ self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0
+ self.min_unique_num = min_unique_num
+ self.eps = eps
+
+ if sample_rate is not None:
+ warnings.warn(
+ "'sample_rate' has been deprecated and will be removed in "
+ 'the future. Please remove it from your code.')
+
+ def _input_generation(self, coords, patch_center, coord_max, attributes,
+ attribute_dims, point_type):
+ """Generating model input.
+
+ Generate input by subtracting patch center and adding additional \
+ features. Currently support colors and normalized xyz as features.
+
+ Args:
+ coords (np.ndarray): Sampled 3D Points.
+ patch_center (np.ndarray): Center coordinate of the selected patch.
+ coord_max (np.ndarray): Max coordinate of all 3D Points.
+ attributes (np.ndarray): features of input points.
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
+ dimension.
+ point_type (type): class of input points inherited from BasePoints.
+
+ Returns:
+ :obj:`BasePoints`: The generated input data.
+ """
+ # subtract patch center, the z dimension is not centered
+ centered_coords = coords.copy()
+ centered_coords[:, 0] -= patch_center[0]
+ centered_coords[:, 1] -= patch_center[1]
+
+ if self.use_normalized_coord:
+ normalized_coord = coords / coord_max
+ attributes = np.concatenate([attributes, normalized_coord], axis=1)
+ if attribute_dims is None:
+ attribute_dims = dict()
+ attribute_dims.update(
+ dict(normalized_coord=[
+ attributes.shape[1], attributes.shape[1] +
+ 1, attributes.shape[1] + 2
+ ]))
+
+ points = np.concatenate([centered_coords, attributes], axis=1)
+ points = point_type(
+ points, points_dim=points.shape[1], attribute_dims=attribute_dims)
+
+ return points
+
+ def _patch_points_sampling(self, points, sem_mask):
+ """Patch points sampling.
+
+ First sample a valid patch.
+ Then sample points within that patch to a certain number.
+
+ Args:
+ points (:obj:`BasePoints`): 3D Points.
+ sem_mask (np.ndarray): semantic segmentation mask for input points.
+
+ Returns:
+ tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`:
+
+ - points (:obj:`BasePoints`): 3D Points.
+ - choices (np.ndarray): The generated random samples.
+ """
+ coords = points.coord.numpy()
+ attributes = points.tensor[:, 3:].numpy()
+ attribute_dims = points.attribute_dims
+ point_type = type(points)
+
+ coord_max = np.amax(coords, axis=0)
+ coord_min = np.amin(coords, axis=0)
+
+ for _ in range(self.num_try):
+ # random sample a point as patch center
+ cur_center = coords[np.random.choice(coords.shape[0])]
+
+ # boundary of a patch, which would be enlarged by
+ # `self.enlarge_size` as an augmentation
+ cur_max = cur_center + np.array(
+ [self.block_size / 2.0, self.block_size / 2.0, 0.0])
+ cur_min = cur_center - np.array(
+ [self.block_size / 2.0, self.block_size / 2.0, 0.0])
+ cur_max[2] = coord_max[2]
+ cur_min[2] = coord_min[2]
+ cur_choice = np.sum(
+ (coords >= (cur_min - self.enlarge_size)) *
+ (coords <= (cur_max + self.enlarge_size)),
+ axis=1) == 3
+
+ if not cur_choice.any(): # no points in this patch
+ continue
+
+ cur_coords = coords[cur_choice, :]
+ cur_sem_mask = sem_mask[cur_choice]
+ point_idxs = np.where(cur_choice)[0]
+ mask = np.sum(
+ (cur_coords >= (cur_min - self.eps)) * (cur_coords <=
+ (cur_max + self.eps)),
+ axis=1) == 3
+
+ # two criteria for patch sampling, adopted from PointNet++
+ # 1. selected patch should contain enough unique points
+ if self.min_unique_num is None:
+ # use PointNet++'s method as default
+ # [31, 31, 62] are just some big values used to transform
+ # coords from 3d array to 1d and then check their uniqueness
+ # this is used in all the ScanNet code following PointNet++
+ vidx = np.ceil(
+ (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) *
+ np.array([31.0, 31.0, 62.0]))
+ vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 +
+ vidx[:, 2])
+ flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02
+ else:
+ # if `min_unique_num` is provided, directly compare with it
+ flag1 = mask.sum() >= self.min_unique_num
+
+ # 2. selected patch should contain enough annotated points
+ if self.ignore_index is None:
+ flag2 = True
+ else:
+ flag2 = np.sum(cur_sem_mask != self.ignore_index) / \
+ len(cur_sem_mask) >= 0.7
+
+ if flag1 and flag2:
+ break
+
+ # sample idx to `self.num_points`
+ if point_idxs.size >= self.num_points:
+ # no duplicate in sub-sampling
+ choices = np.random.choice(
+ point_idxs, self.num_points, replace=False)
+ else:
+ # do not use random choice here to avoid some points not counted
+ dup = np.random.choice(point_idxs.size,
+ self.num_points - point_idxs.size)
+ idx_dup = np.concatenate(
+ [np.arange(point_idxs.size),
+ np.array(dup)], 0)
+ choices = point_idxs[idx_dup]
+
+ # construct model input
+ points = self._input_generation(coords[choices], cur_center, coord_max,
+ attributes[choices], attribute_dims,
+ point_type)
+
+ return points, choices
+
+ def __call__(self, results):
+ """Call function to sample points to in indoor scenes.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after sampling, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ points = results['points']
+
+ assert 'pts_semantic_mask' in results.keys(), \
+ 'semantic mask should be provided in training and evaluation'
+ pts_semantic_mask = results['pts_semantic_mask']
+
+ points, choices = self._patch_points_sampling(points,
+ pts_semantic_mask)
+
+ results['points'] = points
+ results['pts_semantic_mask'] = pts_semantic_mask[choices]
+ pts_instance_mask = results.get('pts_instance_mask', None)
+ if pts_instance_mask is not None:
+ results['pts_instance_mask'] = pts_instance_mask[choices]
+
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(num_points={self.num_points},'
+ repr_str += f' block_size={self.block_size},'
+ repr_str += f' ignore_index={self.ignore_index},'
+ repr_str += f' use_normalized_coord={self.use_normalized_coord},'
+ repr_str += f' num_try={self.num_try},'
+ repr_str += f' enlarge_size={self.enlarge_size},'
+ repr_str += f' min_unique_num={self.min_unique_num},'
+ repr_str += f' eps={self.eps})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class BackgroundPointsFilter(object):
+ """Filter background points near the bounding box.
+
+ Args:
+ bbox_enlarge_range (tuple[float], float): Bbox enlarge range.
+ """
+
+ def __init__(self, bbox_enlarge_range):
+ assert (is_tuple_of(bbox_enlarge_range, float)
+ and len(bbox_enlarge_range) == 3) \
+ or isinstance(bbox_enlarge_range, float), \
+ f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}'
+
+ if isinstance(bbox_enlarge_range, float):
+ bbox_enlarge_range = [bbox_enlarge_range] * 3
+ self.bbox_enlarge_range = np.array(
+ bbox_enlarge_range, dtype=np.float32)[np.newaxis, :]
+
+ def __call__(self, input_dict):
+ """Call function to filter points by the range.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ points = input_dict['points']
+ gt_bboxes_3d = input_dict['gt_bboxes_3d']
+
+ # avoid groundtruth being modified
+ gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy()
+ gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy()
+
+ enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy()
+ enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range
+ points_numpy = points.tensor.clone().numpy()
+ foreground_masks = box_np_ops.points_in_rbbox(
+ points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5))
+ enlarge_foreground_masks = box_np_ops.points_in_rbbox(
+ points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5))
+ foreground_masks = foreground_masks.max(1)
+ enlarge_foreground_masks = enlarge_foreground_masks.max(1)
+ valid_masks = ~np.logical_and(~foreground_masks,
+ enlarge_foreground_masks)
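+ # i.e. keep a point unless it lies inside an enlarged box but outside the
+ # original box (the background shell hugging each ground-truth box is dropped)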
+
+ input_dict['points'] = points[valid_masks]
+ pts_instance_mask = input_dict.get('pts_instance_mask', None)
+ if pts_instance_mask is not None:
+ input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks]
+
+ pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
+ if pts_semantic_mask is not None:
+ input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks]
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class VoxelBasedPointSampler(object):
+ """Voxel based point sampler.
+
+ Apply voxel sampling to multiple sweep points.
+
+ Args:
+ cur_sweep_cfg (dict): Config for sampling current points.
+ prev_sweep_cfg (dict): Config for sampling previous points.
+ time_dim (int): Index that indicates the time dimension
+ of the input points.
+ """
+
+ def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3):
+ self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg)
+ self.cur_voxel_num = self.cur_voxel_generator._max_voxels
+ self.time_dim = time_dim
+ if prev_sweep_cfg is not None:
+ assert prev_sweep_cfg['max_num_points'] == \
+ cur_sweep_cfg['max_num_points']
+ self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg)
+ self.prev_voxel_num = self.prev_voxel_generator._max_voxels
+ else:
+ self.prev_voxel_generator = None
+ self.prev_voxel_num = 0
+
+ def _sample_points(self, points, sampler, point_dim):
+ """Sample points for each points subset.
+
+ Args:
+ points (np.ndarray): Points subset to be sampled.
+ sampler (VoxelGenerator): Voxel based sampler for
+ each points subset.
+ point_dim (int): The dimension of each point.
+
+ Returns:
+ np.ndarray: Sampled points.
+ """
+ voxels, coors, num_points_per_voxel = sampler.generate(points)
+ if voxels.shape[0] < sampler._max_voxels:
+ padding_points = np.zeros([
+ sampler._max_voxels - voxels.shape[0], sampler._max_num_points,
+ point_dim
+ ],
+ dtype=points.dtype)
+ padding_points[:] = voxels[0]
+ sample_points = np.concatenate([voxels, padding_points], axis=0)
+ else:
+ sample_points = voxels
+
+ return sample_points
+
+ def __call__(self, results):
+ """Call function to sample points from multiple sweeps.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after sampling, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ points = results['points']
+ original_dim = points.shape[1]
+
+ # TODO: process instance and semantic mask while _max_num_points
+ # is larger than 1
+ # Extend points with seg and mask fields
+ map_fields2dim = []
+ start_dim = original_dim
+ points_numpy = points.tensor.numpy()
+ extra_channel = [points_numpy]
+ for idx, key in enumerate(results['pts_mask_fields']):
+ map_fields2dim.append((key, idx + start_dim))
+ extra_channel.append(results[key][..., None])
+
+ start_dim += len(results['pts_mask_fields'])
+ for idx, key in enumerate(results['pts_seg_fields']):
+ map_fields2dim.append((key, idx + start_dim))
+ extra_channel.append(results[key][..., None])
+
+ points_numpy = np.concatenate(extra_channel, axis=-1)
+
+ # Split points into two part, current sweep points and
+ # previous sweeps points.
+ # TODO: support different sampling methods for next sweeps points
+ # and previous sweeps points.
+ cur_points_flag = (points_numpy[:, self.time_dim] == 0)
+ cur_sweep_points = points_numpy[cur_points_flag]
+ prev_sweeps_points = points_numpy[~cur_points_flag]
+ if prev_sweeps_points.shape[0] == 0:
+ prev_sweeps_points = cur_sweep_points
+
+ # Shuffle points before sampling
+ np.random.shuffle(cur_sweep_points)
+ np.random.shuffle(prev_sweeps_points)
+
+ cur_sweep_points = self._sample_points(cur_sweep_points,
+ self.cur_voxel_generator,
+ points_numpy.shape[1])
+ if self.prev_voxel_generator is not None:
+ prev_sweeps_points = self._sample_points(prev_sweeps_points,
+ self.prev_voxel_generator,
+ points_numpy.shape[1])
+
+ points_numpy = np.concatenate(
+ [cur_sweep_points, prev_sweeps_points], 0)
+ else:
+ points_numpy = cur_sweep_points
+
+ if self.cur_voxel_generator._max_num_points == 1:
+ points_numpy = points_numpy.squeeze(1)
+ results['points'] = points.new_point(points_numpy[..., :original_dim])
+
+ # Restore the corresponding seg and mask fields
+ for key, dim_index in map_fields2dim:
+ results[key] = points_numpy[..., dim_index]
+
+ return results
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+
+ def _auto_indent(repr_str, indent):
+ repr_str = repr_str.split('\n')
+ repr_str = [' ' * indent + t + '\n' for t in repr_str]
+ repr_str = ''.join(repr_str)[:-1]
+ return repr_str
+
+ repr_str = self.__class__.__name__
+ indent = 4
+ repr_str += '(\n'
+ repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n'
+ repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n'
+ repr_str += ' ' * indent + f'time_dim={self.time_dim},\n'
+ repr_str += ' ' * indent + 'cur_voxel_generator=\n'
+ repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n'
+ repr_str += ' ' * indent + 'prev_voxel_generator=\n'
+ repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})'
+ return repr_str
+
+@PIPELINES.register_module()
+class PadMultiViewImage(object):
+ """Pad the multi-view image.
+ There are two padding modes: (1) pad to a fixed size and (2) pad to the
+ minimum size that is divisible by some number.
+ Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
+ Args:
+ size (tuple, optional): Fixed padding size.
+ size_divisor (int, optional): The divisor of padded size.
+ pad_val (float, optional): Padding value, 0 by default.
+ """
+
+ def __init__(self, size=None, size_divisor=None, pad_val=0):
+ self.size = size
+ self.size_divisor = size_divisor
+ self.pad_val = pad_val
+ # only one of size and size_divisor should be valid
+ assert size is not None or size_divisor is not None
+ assert size is None or size_divisor is None
+
+ def _pad_img(self, results):
+ """Pad images according to ``self.size``."""
+ if self.size is not None:
+ padded_img = [impad(
+ img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
+ elif self.size_divisor is not None:
+ padded_img = [impad_to_multiple(
+ img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
+
+ results['ori_shape'] = [img.shape for img in results['img']]
+ results['img'] = padded_img
+ results['img_shape'] = [img.shape for img in padded_img]
+ results['pad_shape'] = [img.shape for img in padded_img]
+ results['pad_fixed_size'] = self.size
+ results['pad_size_divisor'] = self.size_divisor
+
+ def __call__(self, results):
+ """Call function to pad images, masks, semantic segmentation maps.
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Updated result dict.
+ """
+ self._pad_img(results)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(size={self.size}, '
+ repr_str += f'size_divisor={self.size_divisor}, '
+ repr_str += f'pad_val={self.pad_val})'
+ return repr_str
+
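+# Illustrative config (the divisor is an assumed, typical value): pad every camera
+# image so that both sides are divisible by 32, e.g.
+#   dict(type='PadMultiViewImage', size_divisor=32)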
+
+@PIPELINES.register_module()
+class NormalizeMultiviewImage(object):
+ """Normalize the image.
+ Added key is "img_norm_cfg".
+ Args:
+ mean (sequence): Mean values of 3 channels.
+ std (sequence): Std values of 3 channels.
+ to_rgb (bool): Whether to convert the image from BGR to RGB.
+ Defaults to True.
+ """
+
+ def __init__(self, mean, std, to_rgb=True):
+ self.mean = np.array(mean, dtype=np.float32)
+ self.std = np.array(std, dtype=np.float32)
+ self.to_rgb = to_rgb
+
+
+ def __call__(self, results):
+ """Call function to normalize images.
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Normalized results, 'img_norm_cfg' key is added into
+ result dict.
+ """
+
+ results['img'] = [imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']]
+ results['img_norm_cfg'] = dict(
+ mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
+ return repr_str
+
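+# Illustrative config (the statistics below are the standard ImageNet values and are
+# an assumption, not mandated by this repo):
+#   dict(type='NormalizeMultiviewImage',
+#        mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)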
+
+@PIPELINES.register_module()
+class PhotoMetricDistortionMultiViewImage:
+ """Apply photometric distortion to image sequentially, every transformation
+ is applied with a probability of 0.5. The position of random contrast is in
+ second or second to last.
+ 1. random brightness
+ 2. random contrast (mode 0)
+ 3. convert color from BGR to HSV
+ 4. random saturation
+ 5. random hue
+ 6. convert color from HSV to BGR
+ 7. random contrast (mode 1)
+ 8. randomly swap channels
+ Args:
+ brightness_delta (int): delta of brightness.
+ contrast_range (tuple): range of contrast.
+ saturation_range (tuple): range of saturation.
+ hue_delta (int): delta of hue.
+ """
+
+ def __init__(self,
+ brightness_delta=32,
+ contrast_range=(0.5, 1.5),
+ saturation_range=(0.5, 1.5),
+ hue_delta=18):
+ self.brightness_delta = brightness_delta
+ self.contrast_lower, self.contrast_upper = contrast_range
+ self.saturation_lower, self.saturation_upper = saturation_range
+ self.hue_delta = hue_delta
+
+ def __call__(self, results):
+ """Call function to perform photometric distortion on images.
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Result dict with images distorted.
+ """
+ imgs = results['img']
+ new_imgs = []
+ for img in imgs:
+ assert img.dtype == np.float32, \
+ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\
+ ' please set "to_float32=True" in "LoadImageFromFile" pipeline'
+ # random brightness
+ if random.randint(2):
+ delta = random.uniform(-self.brightness_delta,
+ self.brightness_delta)
+ img += delta
+
+ # mode == 0 --> do random contrast first
+ # mode == 1 --> do random contrast last
+ mode = random.randint(2)
+ if mode == 1:
+ if random.randint(2):
+ alpha = random.uniform(self.contrast_lower,
+ self.contrast_upper)
+ img *= alpha
+
+ # convert color from BGR to HSV
+ img = bgr2hsv(img)
+
+ # random saturation
+ if random.randint(2):
+ img[..., 1] *= random.uniform(self.saturation_lower,
+ self.saturation_upper)
+
+ # random hue
+ if random.randint(2):
+ img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
+ img[..., 0][img[..., 0] > 360] -= 360
+ img[..., 0][img[..., 0] < 0] += 360
+
+ # convert color from HSV to BGR
+ img = hsv2bgr(img)
+
+ # random contrast
+ if mode == 0:
+ if random.randint(2):
+ alpha = random.uniform(self.contrast_lower,
+ self.contrast_upper)
+ img *= alpha
+
+ # randomly swap channels
+ if random.randint(2):
+ img = img[..., random.permutation(3)]
+ new_imgs.append(img)
+ results['img'] = new_imgs
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
+ repr_str += 'contrast_range='
+ repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
+ repr_str += 'saturation_range='
+ repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
+ repr_str += f'hue_delta={self.hue_delta})'
+ return repr_str
+
+
+
+@PIPELINES.register_module()
+class CustomCollect3D(object):
+ """Collect data from the loader relevant to the specific task.
+ This is usually the last stage of the data loader pipeline. Typically keys
+ is set to some subset of "img", "proposals", "gt_bboxes",
+ "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
+ The "img_meta" item is always populated. The contents of the "img_meta"
+ dictionary depends on "meta_keys". By default this includes:
+ - 'img_shape': shape of the image input to the network as a tuple \
+ (h, w, c). Note that images may be zero padded on the \
+ bottom/right if the batch tensor is larger than this shape.
+ - 'scale_factor': a float indicating the preprocessing scale
+ - 'flip': a boolean indicating if image flip transform was used
+ - 'filename': path to the image file
+ - 'ori_shape': original shape of the image as a tuple (h, w, c)
+ - 'pad_shape': image shape after padding
+ - 'lidar2img': transform from lidar to image
+ - 'depth2img': transform from depth to image
+ - 'cam2img': transform from camera to image
+ - 'pcd_horizontal_flip': a boolean indicating if point cloud is \
+ flipped horizontally
+ - 'pcd_vertical_flip': a boolean indicating if point cloud is \
+ flipped vertically
+ - 'box_mode_3d': 3D box mode
+ - 'box_type_3d': 3D box type
+ - 'img_norm_cfg': a dict of normalization information:
+ - mean: per channel mean subtraction
+ - std: per channel std divisor
+ - to_rgb: bool indicating if bgr was converted to rgb
+ - 'pcd_trans': point cloud transformations
+ - 'sample_idx': sample index
+ - 'pcd_scale_factor': point cloud scale factor
+ - 'pcd_rotation': rotation applied to point cloud
+ - 'pts_filename': path to point cloud file.
+ Args:
+ keys (Sequence[str]): Keys of results to be collected in ``data``.
+ meta_keys (Sequence[str], optional): Meta keys to be converted to
+ ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+ Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
+ 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
+ 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
+ 'box_type_3d', 'img_norm_cfg', 'pcd_trans',
+ 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
+ """
+
+ def __init__(self,
+ keys,
+ meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
+ 'depth2img', 'cam2img', 'pad_shape',
+ 'scale_factor', 'flip', 'pcd_horizontal_flip',
+ 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+ 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx',
+ 'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+ 'transformation_3d_flow', 'scene_token',
+ 'can_bus','folder','frame_idx'
+ )):
+ # TODO(yzj) bevformer meta_keys has lidar2cam
+ self.keys = keys
+ self.meta_keys = meta_keys
+
+ def __call__(self, results):
+ """Call function to collect keys in results. The keys in ``meta_keys``
+ will be converted to :obj:`mmcv.DataContainer`.
+ Args:
+ results (dict): Result dict contains the data to collect.
+ Returns:
+ dict: The result dict contains the following keys
+ - keys in ``self.keys``
+ - ``img_metas``
+ """
+
+ data = {}
+ img_metas = {}
+ for key in self.meta_keys:
+ if key in results:
+ img_metas[key] = results[key]
+
+ data['img_metas'] = DC(img_metas, cpu_only=True)
+ for key in self.keys:
+ data[key] = results[key]
+ return data
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ return self.__class__.__name__ + \
+ f'(keys={self.keys}, meta_keys={self.meta_keys})'
+
+
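+# Illustrative config (the key list is an assumption and depends on the model):
+#   dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])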
+
+@PIPELINES.register_module()
+class RandomScaleImageMultiViewImage(object):
+ """Random scale the image
+ Args:
+ scales
+ """
+
+ def __init__(self, scales=[]):
+ self.scales = scales
+ assert len(self.scales)==1
+
+ def __call__(self, results):
+ """Call function to pad images, masks, semantic segmentation maps.
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Updated result dict.
+ """
+ rand_ind = np.random.permutation(range(len(self.scales)))[0]
+ rand_scale = self.scales[rand_ind]
+
+ y_size = [int(img.shape[0] * rand_scale) for img in results['img']]
+ x_size = [int(img.shape[1] * rand_scale) for img in results['img']]
+ scale_factor = np.eye(4)
+ scale_factor[0, 0] *= rand_scale
+ scale_factor[1, 1] *= rand_scale
+ results['img'] = [imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in
+ enumerate(results['img'])]
+ lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
+ results['lidar2img'] = lidar2img
+ results['img_shape'] = [img.shape for img in results['img']]
+ results['ori_shape'] = [img.shape for img in results['img']]
+
+ return results
+
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(scales={self.scales})'
+ return repr_str
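+
+# Illustrative config (the scale value is an assumption): resize all views to half
+# resolution while keeping the lidar2img projections consistent, e.g.
+#   dict(type='RandomScaleImageMultiViewImage', scales=[0.5])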
+
+@PIPELINES.register_module()
+class ObjectRangeFilterTrack(object):
+ """Filter objects by the range.
+ Args:
+ point_cloud_range (list[float]): Point cloud range.
+ """
+
+ def __init__(self, point_cloud_range):
+ self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+ def __call__(self, input_dict):
+ """Call function to filter objects by the range.
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
+ keys are updated in the result dict.
+ """
+ # Check points instance type and initialise bev_range
+ if isinstance(input_dict['gt_bboxes_3d'],
+ (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+ bev_range = self.pcd_range[[0, 1, 3, 4]]
+ elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes):
+ bev_range = self.pcd_range[[0, 2, 3, 5]]
+
+ if 'gt_inds' in input_dict['ann_info'].keys():
+ input_dict['gt_inds'] = input_dict['ann_info']['gt_inds']
+ if 'gt_fut_traj' in input_dict['ann_info'].keys():
+ input_dict['gt_fut_traj'] = input_dict['ann_info']['gt_fut_traj']
+ if 'gt_fut_traj_mask' in input_dict['ann_info'].keys():
+ input_dict['gt_fut_traj_mask'] = input_dict['ann_info']['gt_fut_traj_mask']
+ if 'gt_past_traj' in input_dict['ann_info'].keys():
+ input_dict['gt_past_traj'] = input_dict['ann_info']['gt_past_traj']
+ if 'gt_past_traj_mask' in input_dict['ann_info'].keys():
+ input_dict['gt_past_traj_mask'] = input_dict['ann_info']['gt_past_traj_mask']
+ if 'gt_sdc_bbox' in input_dict['ann_info'].keys():
+ input_dict['gt_sdc_bbox'] = input_dict['ann_info']['gt_sdc_bbox']
+ input_dict['gt_sdc_label'] = input_dict['ann_info']['gt_sdc_label']
+ input_dict['gt_sdc_fut_traj'] = input_dict['ann_info']['gt_sdc_fut_traj']
+ input_dict['gt_sdc_fut_traj_mask'] = input_dict['ann_info']['gt_sdc_fut_traj_mask']
+
+ gt_bboxes_3d = input_dict['gt_bboxes_3d']
+ gt_labels_3d = input_dict['gt_labels_3d']
+ gt_inds = input_dict['gt_inds']
+ gt_fut_traj = input_dict['gt_fut_traj']
+ gt_fut_traj_mask = input_dict['gt_fut_traj_mask']
+ gt_past_traj = input_dict['gt_past_traj']
+ gt_past_traj_mask = input_dict['gt_past_traj_mask']
+
+ mask = gt_bboxes_3d.in_range_bev(bev_range)
+ gt_bboxes_3d = gt_bboxes_3d[mask]
+ # mask is a torch tensor but gt_labels_3d is still numpy array
+ # using mask to index gt_labels_3d will cause bug when
+ # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+ # as gt_labels_3d[1] and cause out of index error
+ mask = mask.numpy().astype(bool)
+ gt_labels_3d = gt_labels_3d[mask]
+ gt_inds = gt_inds[mask]
+ gt_fut_traj = gt_fut_traj[mask]
+ gt_fut_traj_mask = gt_fut_traj_mask[mask]
+ gt_past_traj = gt_past_traj[mask]
+ gt_past_traj_mask = gt_past_traj_mask[mask]
+
+ # limit rad to [-pi, pi]
+ gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+ input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+ input_dict['gt_labels_3d'] = gt_labels_3d
+ input_dict['gt_inds'] = gt_inds
+ input_dict['gt_fut_traj'] = gt_fut_traj
+ input_dict['gt_fut_traj_mask'] = gt_fut_traj_mask
+ input_dict['gt_past_traj'] = gt_past_traj
+ input_dict['gt_past_traj_mask'] = gt_past_traj_mask
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+ return repr_str
+
+@PIPELINES.register_module()
+class ObjectNameFilterTrack(object):
+ """Filter GT objects by their names.
+ Args:
+ classes (list[str]): List of class names to be kept for training.
+ """
+
+ def __init__(self, classes):
+ self.classes = classes
+ self.labels = list(range(len(self.classes)))
+
+ def __call__(self, input_dict):
+ """Call function to filter objects by their names.
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
+ keys are updated in the result dict.
+ """
+ gt_labels_3d = input_dict['gt_labels_3d']
+ gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d],
+ dtype=np.bool_)
+ input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask]
+ input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask]
+ input_dict['gt_inds'] = input_dict['gt_inds'][gt_bboxes_mask]
+ input_dict['gt_fut_traj'] = input_dict['gt_fut_traj'][gt_bboxes_mask]
+ input_dict['gt_fut_traj_mask'] = input_dict['gt_fut_traj_mask'][gt_bboxes_mask]
+ input_dict['gt_past_traj'] = input_dict['gt_past_traj'][gt_bboxes_mask]
+ input_dict['gt_past_traj_mask'] = input_dict['gt_past_traj_mask'][gt_bboxes_mask]
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(classes={self.classes})'
+ return repr_str
+
+@PIPELINES.register_module()
+class CustomObjectRangeFilter(ObjectRangeFilter):
+ def __call__(self, results):
+ """Call function to filter objects by the range.
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
+ keys are updated in the result dict.
+ """
+ # Check points instance type and initialise bev_range
+ if isinstance(results['gt_bboxes_3d'],
+ (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+ bev_range = self.pcd_range[[0, 1, 3, 4]]
+ elif isinstance(results['gt_bboxes_3d'], CameraInstance3DBoxes):
+ bev_range = self.pcd_range[[0, 2, 3, 5]]
+
+ gt_bboxes_3d = results['gt_bboxes_3d']
+ gt_labels_3d = results['gt_labels_3d']
+ mask = gt_bboxes_3d.in_range_bev(bev_range)
+ gt_bboxes_3d = gt_bboxes_3d[mask]
+ # mask is a torch tensor but gt_labels_3d is still numpy array
+ # using mask to index gt_labels_3d will cause bug when
+ # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+ # as gt_labels_3d[1] and cause out of index error
+ gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
+
+ # limit rad to [-pi, pi]
+ gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+ results['gt_bboxes_3d'] = gt_bboxes_3d
+ results['gt_labels_3d'] = gt_labels_3d
+ # results['ann_tokens'] = results['ann_tokens'][mask.numpy().astype(np.bool)]
+
+ return results
+
+@PIPELINES.register_module()
+class CustomObjectNameFilter(ObjectNameFilter):
+ def __call__(self, results):
+ """Call function to filter objects by their names.
+ Args:
+ results (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
+ keys are updated in the result dict.
+ """
+ gt_labels_3d = results['gt_labels_3d']
+ gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d],
+ dtype=np.bool_)
+ results['gt_bboxes_3d'] = results['gt_bboxes_3d'][gt_bboxes_mask]
+ results['gt_labels_3d'] = results['gt_labels_3d'][gt_bboxes_mask]
+ # results['ann_tokens'] = results['ann_tokens'][gt_bboxes_mask]
+
+ return results
+
+
+@PIPELINES.register_module()
+class VADObjectRangeFilter(object):
+ """Filter objects by the range, and also filter corresponding fut trajs
+
+ Args:
+ point_cloud_range (list[float]): Point cloud range.
+ """
+
+ def __init__(self, point_cloud_range):
+ self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+ def __call__(self, input_dict):
+ """Call function to filter objects by the range.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
+ keys are updated in the result dict.
+ """
+ # Check points instance type and initialise bev_range
+ if isinstance(input_dict['gt_bboxes_3d'],
+ (LiDARInstance3DBoxes, DepthInstance3DBoxes)):
+ bev_range = self.pcd_range[[0, 1, 3, 4]]
+ elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes):
+ bev_range = self.pcd_range[[0, 2, 3, 5]]
+
+ gt_bboxes_3d = input_dict['gt_bboxes_3d']
+ gt_labels_3d = input_dict['gt_labels_3d']
+
+
+ mask = gt_bboxes_3d.in_range_bev(bev_range)
+ gt_bboxes_3d = gt_bboxes_3d[mask]
+ # mask is a torch tensor but gt_labels_3d is still numpy array
+ # using mask to index gt_labels_3d will cause bug when
+ # len(gt_labels_3d) == 1, where mask=1 will be interpreted
+ # as gt_labels_3d[1] and cause out of index error
+ gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
+
+
+ # limit rad to [-pi, pi]
+ gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
+ input_dict['gt_bboxes_3d'] = gt_bboxes_3d
+ input_dict['gt_labels_3d'] = gt_labels_3d
+
+ if 'attr_labels' in input_dict:
+ gt_attr_labels = input_dict['attr_labels']
+ gt_attr_labels = gt_attr_labels[mask.numpy().astype(bool)]
+ input_dict['gt_attr_labels'] = gt_attr_labels
+
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class VADObjectNameFilter(object):
+ """Filter GT objects by their names, , and also filter corresponding fut trajs
+
+ Args:
+ classes (list[str]): List of class names to be kept for training.
+ """
+
+ def __init__(self, classes):
+ self.classes = classes
+ self.labels = list(range(len(self.classes)))
+
+ def __call__(self, input_dict):
+ """Call function to filter objects by their names.
+
+ Args:
+ input_dict (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
+ keys are updated in the result dict.
+ """
+ gt_labels_3d = input_dict['gt_labels_3d']
+ gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d],
+ dtype=np.bool_)
+ input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask]
+ input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask]
+ if 'gt_attr_labels' in input_dict:
+ input_dict['gt_attr_labels'] = input_dict['gt_attr_labels'][gt_bboxes_mask]
+
+ return input_dict
+
+ def __repr__(self):
+ """str: Return a string that describes the module."""
+ repr_str = self.__class__.__name__
+ repr_str += f'(classes={self.classes})'
+ return repr_str
+
+@PIPELINES.register_module()
+class CustomPointsRangeFilter:
+ """Filter points by the range.
+ Args:
+ point_cloud_range (list[float]): Point cloud range.
+ """
+
+ def __init__(self, point_cloud_range):
+ self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+ def __call__(self, data):
+ """Call function to filter points by the range.
+ Args:
+ data (dict): Result dict from loading pipeline.
+ Returns:
+ dict: Results after filtering, 'points', 'pts_instance_mask' \
+ and 'pts_semantic_mask' keys are updated in the result dict.
+ """
+ points = data["points"]
+ points_mask = points.in_range_3d(self.pcd_range)
+ clean_points = points[points_mask]
+ data["points"] = clean_points
+ return data
\ No newline at end of file
diff --git a/mmcv/datasets/prepare_B2D.py b/mmcv/datasets/prepare_B2D.py
new file mode 100644
index 0000000..a9b2f33
--- /dev/null
+++ b/mmcv/datasets/prepare_B2D.py
@@ -0,0 +1,401 @@
+import os
+from os.path import join
+import gzip, json, pickle
+import numpy as np
+from pyquaternion import Quaternion
+from tqdm import tqdm
+from vis_utils import calculate_cube_vertices,calculate_occlusion_stats,edges,DIS_CAR_SAVE
+import cv2
+import multiprocessing
+import argparse
+# All data in the Bench2Drive dataset are in the left-handed coordinate system.
+# This code converts all coordinate systems (world coordinate system, vehicle coordinate system,
+# camera coordinate system, and lidar coordinate system) to the right-handed coordinate system
+# consistent with the nuscenes dataset.
+
+DATAROOT = '../../data/bench2drive'
+MAP_ROOT = '../../data/bench2drive/maps'
+OUT_DIR = '../../data/infos'
+
+MAX_DISTANCE = 75 # Filter bounding boxes that are too far from the vehicle
+FILTER_Z_SHRESHOLD = 10 # Filter bounding boxes that are too far above/below the vehicle
+FILTER_INVISINLE = True # Filter bounding boxes based on visibility
+NUM_VISIBLE_SHRESHOLD = 1 # Filter bounding boxes with fewer visible vertices than this value
+NUM_OUTPOINT_SHRESHOLD = 7 # Filter bounding boxes where the number of vertices outside the frame is greater than this value in all cameras
+CAMERAS = ['CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']
+CAMERA_TO_FOLDER_MAP = {'CAM_FRONT':'rgb_front', 'CAM_FRONT_LEFT':'rgb_front_left', 'CAM_FRONT_RIGHT':'rgb_front_right', 'CAM_BACK':'rgb_back', 'CAM_BACK_LEFT':'rgb_back_left', 'CAM_BACK_RIGHT':'rgb_back_right'}
+
+
+stand_to_ue4_rotate = np.array([[ 0, 0, 1, 0],
+ [ 1, 0, 0, 0],
+ [ 0,-1, 0, 0],
+ [ 0, 0, 0, 1]])
+
+
+
+lidar_to_righthand_ego = np.array([[ 0, 1, 0, 0],
+ [ -1, 0, 0, 0],
+ [ 0, 0, 1, 0],
+ [ 0, 0, 0, 1]])
+
+lefthand_ego_to_lidar = np.array([[ 0, 1, 0, 0],
+ [ 1, 0, 0, 0],
+ [ 0, 0, 1, 0],
+ [ 0, 0, 0, 1]])
+
+
+
+left2right = np.eye(4)
+left2right[1,1] = -1
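+
+# Note (sketch of the conversion used below): right_handed = left2right @ pose @ left2right
+# negates the y components of both the rotation and the translation, e.g. a pure
+# translation (x, y, z) in CARLA's left-handed frame becomes (x, -y, z) in the
+# right-handed, nuScenes-style frame.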
+
+def apply_trans(vec,world2ego):
+ vec = np.concatenate((vec,np.array([1])))
+ t = world2ego @ vec
+ return t[0:3]
+
+def get_pose_matrix(dic):
+ new_matrix = np.zeros((4,4))
+ new_matrix[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=dic['theta']-np.pi/2).rotation_matrix
+ new_matrix[0,3] = dic['x']
+ new_matrix[1,3] = dic['y']
+ new_matrix[3,3] = 1
+ return new_matrix
+
+def get_npc2world(npc):
+ for key in ['world2vehicle','world2ego','world2sign','world2ped']:
+ if key in npc.keys():
+ npc2world = np.linalg.inv(np.array(npc[key]))
+ yaw_from_matrix = np.arctan2(npc2world[1,0], npc2world[0,0])
+ yaw = npc['rotation'][-1]/180*np.pi
+ if abs(yaw-yaw_from_matrix)> 0.01:
+ npc2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=yaw).rotation_matrix
+ npc2world = left2right@npc2world@left2right
+ return npc2world
+ npc2world = np.eye(4)
+ npc2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=npc['rotation'][-1]/180*np.pi).rotation_matrix
+ npc2world[0:3,3] = np.array(npc['location'])
+ return left2right@npc2world@left2right
+
+
+def get_global_trigger_vertex(center,extent,yaw_in_degree):
+ x,y = center[0],-center[1]
+ dx,dy = extent[0],extent[1]
+ yaw_in_radians = -yaw_in_degree/180*np.pi
+ vertex_in_self = np.array([[dx,dy],
+ [-dx,dy],
+ [-dx,-dy],
+ [dx,-dy]])
+ rotate_matrix = np.array([[np.cos(yaw_in_radians),-np.sin(yaw_in_radians)],
+ [np.sin(yaw_in_radians), np.cos(yaw_in_radians)]])
+ rotated_vertex = (rotate_matrix @ vertex_in_self.T).T
+ vertex_in_global = np.array([[x,y]]).repeat(4,axis=0) + rotated_vertex
+ return vertex_in_global
+
+
+
+def get_image_point(loc, K, w2c):
+ point = np.array([loc[0], loc[1], loc[2], 1])
+ point_camera = np.dot(w2c, point)
+ point_camera = point_camera[0:3]
+ depth = point_camera[2]
+ point_img = np.dot(K, point_camera)
+ point_img[0] /= point_img[2]
+ point_img[1] /= point_img[2]
+ return point_img[0:2], depth
+
+def get_action(index):
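+    # Each entry maps a discrete expert action id to (throttle, steer, brake, reverse).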
+ Discrete_Actions_DICT = {
+ 0: (0, 0, 1, False),
+ 1: (0.7, -0.5, 0, False),
+ 2: (0.7, -0.3, 0, False),
+ 3: (0.7, -0.2, 0, False),
+ 4: (0.7, -0.1, 0, False),
+ 5: (0.7, 0, 0, False),
+ 6: (0.7, 0.1, 0, False),
+ 7: (0.7, 0.2, 0, False),
+ 8: (0.7, 0.3, 0, False),
+ 9: (0.7, 0.5, 0, False),
+ 10: (0.3, -0.7, 0, False),
+ 11: (0.3, -0.5, 0, False),
+ 12: (0.3, -0.3, 0, False),
+ 13: (0.3, -0.2, 0, False),
+ 14: (0.3, -0.1, 0, False),
+ 15: (0.3, 0, 0, False),
+ 16: (0.3, 0.1, 0, False),
+ 17: (0.3, 0.2, 0, False),
+ 18: (0.3, 0.3, 0, False),
+ 19: (0.3, 0.5, 0, False),
+ 20: (0.3, 0.7, 0, False),
+ 21: (0, -1, 0, False),
+ 22: (0, -0.6, 0, False),
+ 23: (0, -0.3, 0, False),
+ 24: (0, -0.1, 0, False),
+ 25: (1, 0, 0, False),
+ 26: (0, 0.1, 0, False),
+ 27: (0, 0.3, 0, False),
+ 28: (0, 0.6, 0, False),
+ 29: (0, 1.0, 0, False),
+ 30: (0.5, -0.5, 0, True),
+ 31: (0.5, -0.3, 0, True),
+ 32: (0.5, -0.2, 0, True),
+ 33: (0.5, -0.1, 0, True),
+ 34: (0.5, 0, 0, True),
+ 35: (0.5, 0.1, 0, True),
+ 36: (0.5, 0.2, 0, True),
+ 37: (0.5, 0.3, 0, True),
+ 38: (0.5, 0.5, 0, True),
+ }
+ throttle, steer, brake, reverse = Discrete_Actions_DICT[index]
+ return throttle, steer, brake
+
+
+def gengrate_map(map_root):
+ map_infos = {}
+ for file_name in os.listdir(map_root):
+ if '.npz' in file_name:
+ map_info = dict(np.load(join(map_root,file_name), allow_pickle=True)['arr'])
+ town_name = file_name.split('_')[0]
+ map_infos[town_name] = {}
+ lane_points = []
+ lane_types = []
+ lane_sample_points = []
+ trigger_volumes_points = []
+ trigger_volumes_types = []
+ trigger_volumes_sample_points = []
+ for road_id, road in map_info.items():
+ for lane_id, lane in road.items():
+ if lane_id == 'Trigger_Volumes':
+ for single_trigger_volume in lane:
+ points = np.array(single_trigger_volume['Points'])
+ points[:,1] *= -1
+ trigger_volumes_points.append(points)
+ trigger_volumes_sample_points.append(points.mean(axis=0))
+ trigger_volumes_types.append(single_trigger_volume['Type'])
+ else:
+ for single_lane in lane:
+ points = np.array([raw_point[0] for raw_point in single_lane['Points']])
+ points[:,1] *= -1
+ lane_points.append(points)
+ lane_types.append(single_lane['Type'])
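+                        # sample the lane roughly every 50 points, always keeping the last point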
+ lane_lenth = points.shape[0]
+ if lane_lenth % 50 !=0:
+ devide_points = [50*i for i in range(lane_lenth//50+1)]
+ else:
+ devide_points = [50*i for i in range(lane_lenth//50)]
+ devide_points.append(lane_lenth-1)
+ lane_sample_points_tmp = points[devide_points]
+ lane_sample_points.append(lane_sample_points_tmp)
+ map_infos[town_name]['lane_points'] = lane_points
+ map_infos[town_name]['lane_sample_points'] = lane_sample_points
+ map_infos[town_name]['lane_types'] = lane_types
+ map_infos[town_name]['trigger_volumes_points'] = trigger_volumes_points
+ map_infos[town_name]['trigger_volumes_sample_points'] = trigger_volumes_sample_points
+ map_infos[town_name]['trigger_volumes_types'] = trigger_volumes_types
+ with open(join(OUT_DIR,'b2d_map_infos.pkl'),'wb') as f:
+ pickle.dump(map_infos,f)
+
+def preprocess(folder_list,idx,tmp_dir,train_or_val):
+
+ data_root = DATAROOT
+ cameras = CAMERAS
+ final_data = []
+ if idx == 0:
+ folders = tqdm(folder_list)
+ else:
+ folders = folder_list
+
+ for folder_name in folders:
+ folder_path = join(data_root, folder_name)
+ last_position_dict = {}
+ for ann_name in sorted(os.listdir(join(folder_path,'anno')),key= lambda x: int(x.split('.')[0])):
+ position_dict = {}
+ frame_data = {}
+ cam_gray_depth = {}
+ with gzip.open(join(folder_path,'anno',ann_name), 'rt', encoding='utf-8') as gz_file:
+ anno = json.load(gz_file)
+ frame_data['folder'] = folder_name
+ frame_data['town_name'] = folder_name.split('/')[1].split('_')[1]
+ frame_data['command_far_xy'] = np.array([anno['x_command_far'],-anno['y_command_far']])
+ frame_data['command_far'] = anno['command_far']
+ frame_data['command_near_xy'] = np.array([anno['x_command_near'],-anno['y_command_near']])
+ frame_data['command_near'] = anno['command_near']
+ frame_data['frame_idx'] = int(ann_name.split('.')[0])
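+            # left-handed CARLA pose -> right-handed convention: negate theta and y, offset the yaw by pi/2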
+ frame_data['ego_yaw'] = -np.nan_to_num(anno['theta'],nan=np.pi)+np.pi/2
+ frame_data['ego_translation'] = np.array([anno['x'],-anno['y'],0])
+ frame_data['ego_vel'] = np.array([anno['speed'],0,0])
+ frame_data['ego_accel'] = np.array([anno['acceleration'][0],-anno['acceleration'][1],anno['acceleration'][2]])
+ frame_data['ego_rotation_rate'] = -np.array(anno['angular_velocity'])
+ frame_data['ego_size'] = np.array([anno['bounding_boxes'][0]['extent'][1],anno['bounding_boxes'][0]['extent'][0],anno['bounding_boxes'][0]['extent'][2]])*2
+ world2ego = left2right@anno['bounding_boxes'][0]['world2ego']@left2right
+ frame_data['world2ego'] = world2ego
+ if frame_data['frame_idx'] == 0:
+ expert_file_path = join(folder_path,'expert_assessment','-0001.npz')
+ else:
+ expert_file_path = join(folder_path,'expert_assessment',str(frame_data['frame_idx']-1).zfill(5)+'.npz')
+ expert_data = np.load(expert_file_path,allow_pickle=True)['arr_0']
+ action_id = expert_data[-1]
+ # value = expert_data[-2]
+ # expert_feature = expert_data[:-2]
+ throttle, steer, brake = get_action(action_id)
+ frame_data['brake'] = brake
+ frame_data['throttle'] = throttle
+ frame_data['steer'] = steer
+ #frame_data['action_id'] = action_id
+ #frame_data['value'] = value
+ #frame_data['expert_feature'] = expert_feature
+ ###get sensor infos###
+ sensor_infos = {}
+ for cam in CAMERAS:
+ sensor_infos[cam] = {}
+ sensor_infos[cam]['cam2ego'] = left2right @ np.array(anno['sensors'][cam]['cam2ego']) @stand_to_ue4_rotate
+ sensor_infos[cam]['intrinsic'] = np.array(anno['sensors'][cam]['intrinsic'])
+ sensor_infos[cam]['world2cam'] = np.linalg.inv(stand_to_ue4_rotate) @ np.array(anno['sensors'][cam]['world2cam']) @left2right
+ sensor_infos[cam]['data_path'] = join(folder_name,'camera',CAMERA_TO_FOLDER_MAP[cam],ann_name.split('.')[0]+'.jpg')
+ cam_gray_depth[cam] = cv2.imread(join(data_root,sensor_infos[cam]['data_path']).replace('rgb_','depth_').replace('.jpg','.png'))[:,:,0]
+ sensor_infos['LIDAR_TOP'] = {}
+ sensor_infos['LIDAR_TOP']['lidar2ego'] = np.array(anno['sensors']['LIDAR_TOP']['lidar2ego']) @ lidar_to_righthand_ego
+ world2lidar = lefthand_ego_to_lidar @ np.array(anno['sensors']['LIDAR_TOP']['world2lidar']) @ left2right
+ sensor_infos['LIDAR_TOP']['world2lidar'] = world2lidar
+ frame_data['sensors'] = sensor_infos
+ ###get bounding_boxes infos###
+ gt_boxes = []
+ gt_names = []
+ gt_ids = []
+ num_points_list = []
+ npc2world_list = []
+ for npc in anno['bounding_boxes']:
+ if npc['class'] == 'ego_vehicle': continue
+ if npc['distance'] > MAX_DISTANCE: continue
+ if abs(npc['location'][2] - anno['bounding_boxes'][0]['location'][2]) > FILTER_Z_SHRESHOLD: continue
+ center = np.array([npc['center'][0],-npc['center'][1],npc['center'][2]]) # left hand -> right hand
+ extent = np.array([npc['extent'][1],npc['extent'][0],npc['extent'][2]]) # lwh -> wlh
+ position_dict[npc['id']] = center
+ local_center = apply_trans(center, world2lidar)
+ size = extent*2
+ if 'world2vehicle' in npc.keys():
+ world2vehicle = left2right@np.array(npc['world2vehicle'])@left2right
+ vehicle2lidar = world2lidar @ np.linalg.inv(world2vehicle)
+ yaw_local = np.arctan2(vehicle2lidar[1,0], vehicle2lidar[0,0])
+
+ else:
+ yaw_local = -npc['rotation'][-1]/180*np.pi - frame_data['ego_yaw'] +np.pi / 2
+ yaw_local_in_lidar_box = -yaw_local - np.pi / 2
+ while yaw_local < -np.pi:
+ yaw_local += 2*np.pi
+ while yaw_local > np.pi:
+ yaw_local -= 2*np.pi
+ if 'speed' in npc.keys():
+ if 'vehicle' in npc['class']: # only vehicles have correct speed
+ speed = npc['speed']
+ else:
+ if npc['id'] in last_position_dict.keys(): #calculate speed for other object
+ speed = np.linalg.norm((center-last_position_dict[npc['id']])[0:2]) * 10
+ else:
+ speed = 0
+ else:
+ speed = 0
+ if 'num_points' in npc.keys():
+ num_points = npc['num_points']
+ else:
+ num_points = -1
+ npc2world = get_npc2world(npc)
+ speed_x = speed * np.cos(yaw_local)
+ speed_y = speed * np.sin(yaw_local)
+
+            ###filter_bounding_boxes###
+ if FILTER_INVISINLE:
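+                # project the eight box corners into every camera and keep the box only if enough
+                # corners are visible (in front of the camera, inside the image and not occluded
+                # according to the depth images)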
+ valid = False
+ box2lidar = np.eye(4)
+ box2lidar[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=yaw_local).rotation_matrix
+ box2lidar[0:3,3] = local_center
+ lidar2box = np.linalg.inv(box2lidar)
+ raw_verts = calculate_cube_vertices(local_center,extent)
+ verts = []
+ for raw_vert in raw_verts:
+ tmp = np.dot(lidar2box, [raw_vert[0], raw_vert[1], raw_vert[2],1])
+ tmp[0:3] += local_center
+ verts.append(tmp.tolist()[:-1])
+ for cam in cameras:
+ lidar2cam = np.linalg.inv(frame_data['sensors'][cam]['cam2ego']) @ sensor_infos['LIDAR_TOP']['lidar2ego']
+ test_points = []
+ test_depth = []
+ for vert in verts:
+ point, depth = get_image_point(vert, frame_data['sensors'][cam]['intrinsic'], lidar2cam)
+ if depth > 0:
+ test_points.append(point)
+ test_depth.append(depth)
+
+ num_visible_vertices, num_invisible_vertices, num_vertices_outside_camera, colored_points = calculate_occlusion_stats(np.array(test_points), np.array(test_depth), cam_gray_depth[cam], max_render_depth=MAX_DISTANCE)
+            if num_visible_vertices > NUM_VISIBLE_SHRESHOLD and num_vertices_outside_camera < NUM_OUTPOINT_SHRESHOLD:
+ indice = np.where(self.flag == i)[0]
+ assert len(indice) == size
+ # TODO: check whether torch.randperm() can be replaced by
+ # numpy.random.permutation().
+ indice = indice[list(
+ torch.randperm(int(size), generator=g).numpy())].tolist()
+ extra = int(
+ math.ceil(
+ size * 1.0 / self.samples_per_gpu / self.num_replicas)
+ ) * self.samples_per_gpu * self.num_replicas - len(indice)
+                # pad each group's indices so that its length is divisible by samples_per_gpu * num_replicas
+ tmp = indice.copy()
+ for _ in range(extra // size):
+ indice.extend(tmp)
+ indice.extend(tmp[:extra % size])
+ indices.extend(indice)
+
+ assert len(indices) == self.total_size
+
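+        # shuffle whole chunks of samples_per_gpu indices so that every per-GPU batch keeps
+        # samples from a single group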
+ indices = [
+ indices[j] for i in list(
+ torch.randperm(
+ len(indices) // self.samples_per_gpu, generator=g))
+ for j in range(i * self.samples_per_gpu, (i + 1) *
+ self.samples_per_gpu)
+ ]
+
+ # subsample
+ offset = self.num_samples * self.rank
+ indices = indices[offset:offset + self.num_samples]
+ assert len(indices) == self.num_samples
+
+ return iter(indices)
+
+ def __len__(self):
+ return self.num_samples
+
+ def set_epoch(self, epoch):
+ self.epoch = epoch
diff --git a/mmcv/datasets/samplers/sampler.py b/mmcv/datasets/samplers/sampler.py
new file mode 100644
index 0000000..1906049
--- /dev/null
+++ b/mmcv/datasets/samplers/sampler.py
@@ -0,0 +1,7 @@
+from mmcv.utils.registry import Registry, build_from_cfg
+
+SAMPLER = Registry('sampler')
+
+
+def build_sampler(cfg, default_args):
+ return build_from_cfg(cfg, SAMPLER, default_args)
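+
+# Illustrative usage (assuming a sampler class, e.g. a hypothetical 'MySampler', has been
+# registered in the SAMPLER registry):
+#   sampler = build_sampler(dict(type='MySampler', shuffle=True),
+#                           default_args=dict(dataset=dataset))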
diff --git a/mmcv/datasets/utils.py b/mmcv/datasets/utils.py
new file mode 100644
index 0000000..02cf96d
--- /dev/null
+++ b/mmcv/datasets/utils.py
@@ -0,0 +1,298 @@
+import copy
+import warnings
+from mmcv.models import VGG
+from mmcv.runner.hooks import HOOKS, Hook
+
+from mmcv.datasets.pipelines import (Collect3D, DefaultFormatBundle3D,
+ LoadAnnotations3D,
+ LoadImageFromFileMono3D,
+ LoadMultiViewImageFromFiles,
+ LoadPointsFromFile,
+ LoadPointsFromMultiSweeps,
+ MultiScaleFlipAug3D,
+ PointSegClassMapping)
+
+from mmcv.datasets.builder import PIPELINES
+from mmcv.datasets.pipelines import LoadAnnotations, LoadImageFromFile
+from mmcv.models.dense_heads import GARPNHead, RPNHead
+from mmcv.models.roi_heads.mask_heads import FusedSemanticHead
+from mmcv.parallel import DataContainer
+
+
+def replace_ImageToTensor(pipelines):
+ """Replace the ImageToTensor transform in a data pipeline to
+ DefaultFormatBundle, which is normally useful in batch inference.
+
+ Args:
+ pipelines (list[dict]): Data pipeline configs.
+
+ Returns:
+ list: The new pipeline list with all ImageToTensor replaced by
+ DefaultFormatBundle.
+
+ Examples:
+ >>> pipelines = [
+ ... dict(type='LoadImageFromFile'),
+ ... dict(
+ ... type='MultiScaleFlipAug',
+ ... img_scale=(1333, 800),
+ ... flip=False,
+ ... transforms=[
+ ... dict(type='Resize', keep_ratio=True),
+ ... dict(type='RandomFlip'),
+ ... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+ ... dict(type='Pad', size_divisor=32),
+ ... dict(type='ImageToTensor', keys=['img']),
+ ... dict(type='Collect', keys=['img']),
+ ... ])
+ ... ]
+ >>> expected_pipelines = [
+ ... dict(type='LoadImageFromFile'),
+ ... dict(
+ ... type='MultiScaleFlipAug',
+ ... img_scale=(1333, 800),
+ ... flip=False,
+ ... transforms=[
+ ... dict(type='Resize', keep_ratio=True),
+ ... dict(type='RandomFlip'),
+ ... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
+ ... dict(type='Pad', size_divisor=32),
+ ... dict(type='DefaultFormatBundle'),
+ ... dict(type='Collect', keys=['img']),
+ ... ])
+ ... ]
+ >>> assert expected_pipelines == replace_ImageToTensor(pipelines)
+ """
+ pipelines = copy.deepcopy(pipelines)
+ for i, pipeline in enumerate(pipelines):
+ if pipeline['type'] == 'MultiScaleFlipAug':
+ assert 'transforms' in pipeline
+ pipeline['transforms'] = replace_ImageToTensor(
+ pipeline['transforms'])
+ elif pipeline['type'] == 'ImageToTensor':
+ warnings.warn(
+ '"ImageToTensor" pipeline is replaced by '
+ '"DefaultFormatBundle" for batch inference. It is '
+ 'recommended to manually replace it in the test '
+ 'data pipeline in your config file.', UserWarning)
+ pipelines[i] = {'type': 'DefaultFormatBundle'}
+ return pipelines
+
+
+# def get_loading_pipeline(pipeline):
+# """Only keep loading image and annotations related configuration.
+
+# Args:
+# pipeline (list[dict]): Data pipeline configs.
+
+# Returns:
+# list[dict]: The new pipeline list with only keep
+# loading image and annotations related configuration.
+
+# Examples:
+# >>> pipelines = [
+# ... dict(type='LoadImageFromFile'),
+# ... dict(type='LoadAnnotations', with_bbox=True),
+# ... dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+# ... dict(type='RandomFlip', flip_ratio=0.5),
+# ... dict(type='Normalize', **img_norm_cfg),
+# ... dict(type='Pad', size_divisor=32),
+# ... dict(type='DefaultFormatBundle'),
+# ... dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+# ... ]
+# >>> expected_pipelines = [
+# ... dict(type='LoadImageFromFile'),
+# ... dict(type='LoadAnnotations', with_bbox=True)
+# ... ]
+# >>> assert expected_pipelines ==\
+# ... get_loading_pipeline(pipelines)
+# """
+# loading_pipeline_cfg = []
+# for cfg in pipeline:
+# obj_cls = PIPELINES.get(cfg['type'])
+# # TODO:use more elegant way to distinguish loading modules
+# if obj_cls is not None and obj_cls in (LoadImageFromFile,
+# LoadAnnotations):
+# loading_pipeline_cfg.append(cfg)
+# assert len(loading_pipeline_cfg) == 2, \
+# 'The data pipeline in your config file must include ' \
+# 'loading image and annotations related pipeline.'
+# return loading_pipeline_cfg
+
+
+@HOOKS.register_module()
+class NumClassCheckHook(Hook):
+
+ def _check_head(self, runner):
+ """Check whether the `num_classes` in head matches the length of
+        `CLASSES` in `dataset`.
+
+ Args:
+ runner (obj:`EpochBasedRunner`): Epoch based Runner.
+ """
+ model = runner.model
+ dataset = runner.data_loader.dataset
+ if dataset.CLASSES is None:
+ runner.logger.warning(
+ f'Please set `CLASSES` '
+                f'in the {dataset.__class__.__name__} and '
+ f'check if it is consistent with the `num_classes` '
+ f'of head')
+ else:
+ assert type(dataset.CLASSES) is not str, \
+                (f'`CLASSES` in {dataset.__class__.__name__} '
+                f'should be a tuple of str. '
+                f'Add comma if number of classes is 1 as '
+ f'CLASSES = ({dataset.CLASSES},)')
+ for name, module in model.named_modules():
+ if hasattr(module, 'num_classes') and not isinstance(
+ module, (RPNHead, VGG, FusedSemanticHead, GARPNHead)):
+ assert module.num_classes == len(dataset.CLASSES), \
+ (f'The `num_classes` ({module.num_classes}) in '
+ f'{module.__class__.__name__} of '
+                         f'{model.__class__.__name__} does not match '
+                         f'the length of `CLASSES` '
+                         f'({len(dataset.CLASSES)}) in '
+ f'{dataset.__class__.__name__}')
+
+ def before_train_epoch(self, runner):
+ """Check whether the training dataset is compatible with head.
+
+ Args:
+ runner (obj:`EpochBasedRunner`): Epoch based Runner.
+ """
+ self._check_head(runner)
+
+ def before_val_epoch(self, runner):
+ """Check whether the dataset in val epoch is compatible with head.
+
+ Args:
+ runner (obj:`EpochBasedRunner`): Epoch based Runner.
+ """
+ self._check_head(runner)
+
+
+def is_loading_function(transform):
+ """Judge whether a transform function is a loading function.
+
+ Note: `MultiScaleFlipAug3D` is a wrapper for multiple pipeline functions,
+ so we need to search if its inner transforms contain any loading function.
+
+ Args:
+ transform (dict | :obj:`Pipeline`): A transform config or a function.
+
+ Returns:
+ bool | None: Whether it is a loading function. None means can't judge.
+ When transform is `MultiScaleFlipAug3D`, we return None.
+ """
+ # TODO: use more elegant way to distinguish loading modules
+ loading_functions = (LoadImageFromFile, LoadPointsFromFile,
+ LoadAnnotations3D, LoadMultiViewImageFromFiles,
+ LoadPointsFromMultiSweeps, DefaultFormatBundle3D,
+ Collect3D, LoadImageFromFileMono3D,
+ PointSegClassMapping)
+ if isinstance(transform, dict):
+ obj_cls = PIPELINES.get(transform['type'])
+ if obj_cls is None:
+ return False
+ if obj_cls in loading_functions:
+ return True
+ if obj_cls in (MultiScaleFlipAug3D, ):
+ return None
+ elif callable(transform):
+ if isinstance(transform, loading_functions):
+ return True
+ if isinstance(transform, MultiScaleFlipAug3D):
+ return None
+ return False
+
+
+def get_loading_pipeline(pipeline):
+ """Only keep loading image, points and annotations related configuration.
+
+ Args:
+ pipeline (list[dict] | list[:obj:`Pipeline`]):
+ Data pipeline configs or list of pipeline functions.
+
+ Returns:
+ list[dict] | list[:obj:`Pipeline`]): The new pipeline list with only
+ keep loading image, points and annotations related configuration.
+
+ Examples:
+ >>> pipelines = [
+ ... dict(type='LoadPointsFromFile',
+ ... coord_type='LIDAR', load_dim=4, use_dim=4),
+ ... dict(type='LoadImageFromFile'),
+ ... dict(type='LoadAnnotations3D',
+ ... with_bbox=True, with_label_3d=True),
+ ... dict(type='Resize',
+ ... img_scale=[(640, 192), (2560, 768)], keep_ratio=True),
+ ... dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+ ... dict(type='PointsRangeFilter',
+ ... point_cloud_range=point_cloud_range),
+ ... dict(type='ObjectRangeFilter',
+ ... point_cloud_range=point_cloud_range),
+ ... dict(type='PointShuffle'),
+ ... dict(type='Normalize', **img_norm_cfg),
+ ... dict(type='Pad', size_divisor=32),
+ ... dict(type='DefaultFormatBundle3D', class_names=class_names),
+ ... dict(type='Collect3D',
+ ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
+ ... ]
+ >>> expected_pipelines = [
+ ... dict(type='LoadPointsFromFile',
+ ... coord_type='LIDAR', load_dim=4, use_dim=4),
+ ... dict(type='LoadImageFromFile'),
+ ... dict(type='LoadAnnotations3D',
+ ... with_bbox=True, with_label_3d=True),
+ ... dict(type='DefaultFormatBundle3D', class_names=class_names),
+ ... dict(type='Collect3D',
+ ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
+ ... ]
+ >>> assert expected_pipelines ==\
+ ... get_loading_pipeline(pipelines)
+ """
+ loading_pipeline = []
+ for transform in pipeline:
+ is_loading = is_loading_function(transform)
+ if is_loading is None: # MultiScaleFlipAug3D
+ # extract its inner pipeline
+ if isinstance(transform, dict):
+ inner_pipeline = transform.get('transforms', [])
+ else:
+ inner_pipeline = transform.transforms.transforms
+ loading_pipeline.extend(get_loading_pipeline(inner_pipeline))
+ elif is_loading:
+ loading_pipeline.append(transform)
+ assert len(loading_pipeline) > 0, \
+ 'The data pipeline in your config file must include ' \
+ 'loading step.'
+ return loading_pipeline
+
+
+def extract_result_dict(results, key):
+ """Extract and return the data corresponding to key in result dict.
+
+ ``results`` is a dict output from `pipeline(input_dict)`, which is the
+ loaded data from ``Dataset`` class.
+ The data terms inside may be wrapped in list, tuple and DataContainer, so
+ this function essentially extracts data from these wrappers.
+
+ Args:
+ results (dict): Data loaded using pipeline.
+ key (str): Key of the desired data.
+
+ Returns:
+ np.ndarray | torch.Tensor | None: Data term.
+ """
+ if key not in results.keys():
+ return None
+ # results[key] may be data or list[data] or tuple[data]
+ # data may be wrapped inside DataContainer
+ data = results[key]
+ if isinstance(data, (list, tuple)):
+ data = data[0]
+ if isinstance(data, DataContainer):
+ data = data._data
+ return data
+
diff --git a/mmcv/datasets/vad_custom_nuscenes_eval.py b/mmcv/datasets/vad_custom_nuscenes_eval.py
new file mode 100644
index 0000000..0285591
--- /dev/null
+++ b/mmcv/datasets/vad_custom_nuscenes_eval.py
@@ -0,0 +1,834 @@
+import argparse
+import copy
+import json
+import os
+import time
+from typing import Tuple, Dict, Any
+from mmcv.fileio.io import dump,load
+import torch
+import numpy as np
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.detection.evaluate import NuScenesEval
+from pyquaternion import Quaternion
+from nuscenes.eval.detection.data_classes import DetectionBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.tracking.data_classes import TrackingBox
+from nuscenes.utils.data_classes import Box
+from nuscenes.utils.geometry_utils import points_in_box
+from nuscenes.utils.splits import create_splits_scenes
+import tqdm
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+import pycocotools.mask as mask_util
+# from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from torchvision.transforms.functional import rotate
+import cv2
+import random
+from nuscenes.eval.common.loaders import load_gt, add_center_dist
+from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp
+from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, DetectionMetricData,DetectionMetricDataList
+from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from IPython import embed
+from matplotlib import pyplot as plt
+from nuscenes.eval.common.render import setup_axis
+from nuscenes.eval.common.utils import boxes_to_sensor
+from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \
+ PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS
+from nuscenes.utils.data_classes import LidarPointCloud
+import mmcv
+
+
+Axis = Any
+
+def class_tp_curve(md_list: DetectionMetricDataList,
+ metrics: DetectionMetrics,
+ detection_name: str,
+ min_recall: float,
+ dist_th_tp: float,
+ savepath: str = None,
+ ax: Axis = None) -> None:
+ """
+ Plot the true positive curve for the specified class.
+ :param md_list: DetectionMetricDataList instance.
+ :param metrics: DetectionMetrics instance.
+    :param detection_name: The detection class.
+    :param min_recall: Minimum recall value.
+    :param dist_th_tp: The distance threshold used to determine matches.
+    :param savepath: If given, saves the rendering here instead of displaying.
+ :param ax: Axes onto which to render.
+ """
+ # Get metric data for given detection class with tp distance threshold.
+
+ md = md_list[(detection_name, dist_th_tp)]
+ min_recall_ind = round(100 * min_recall)
+ if min_recall_ind <= md.max_recall_ind:
+ # For traffic_cone and barrier only a subset of the metrics are plotted.
+ rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
+ ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
+ else:
+ ylimit = 1.0
+
+ # Prepare axis.
+ if ax is None:
+ ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
+ min_recall=min_recall)
+ ax.set_ylim(0, ylimit)
+
+ # Plot the recall vs. error curve for each tp metric.
+ for metric in TP_METRICS:
+ tp = metrics.get_label_tp(detection_name, metric)
+
+ # Plot only if we have valid data.
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
+ recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
+ else:
+ recall, error = [], []
+
+ # Change legend based on tp value
+ if tp is np.nan:
+ label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
+ elif min_recall_ind > md.max_recall_ind:
+ label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
+ else:
+ label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
+ if metric == 'trans_err':
+ label += f' ({md.max_recall_ind})' # add recall
+ print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
+ ax.plot(recall, error, label=label)
+ ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
+ ax.legend(loc='best')
+
+ if savepath is not None:
+ plt.savefig(savepath)
+ plt.close()
+
+
+class DetectionBox_modified(DetectionBox):
+ def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
+        """DetectionBox extended with an annotation token, visibility level and frame index."""
+ super().__init__(*args, **kwargs)
+ self.token = token
+ self.visibility = visibility
+ self.index = index
+
+ def serialize(self) -> dict:
+ """ Serialize instance into json-friendly format. """
+ return {
+ 'token': self.token,
+ 'sample_token': self.sample_token,
+ 'translation': self.translation,
+ 'size': self.size,
+ 'rotation': self.rotation,
+ 'velocity': self.velocity,
+ 'ego_translation': self.ego_translation,
+ 'num_pts': self.num_pts,
+ 'detection_name': self.detection_name,
+ 'detection_score': self.detection_score,
+ 'attribute_name': self.attribute_name,
+ 'visibility': self.visibility,
+ 'index': self.index
+
+ }
+
+ @classmethod
+ def deserialize(cls, content: dict):
+ """ Initialize from serialized content. """
+ return cls(
+ token=content['token'],
+ sample_token=content['sample_token'],
+ translation=tuple(content['translation']),
+ size=tuple(content['size']),
+ rotation=tuple(content['rotation']),
+ velocity=tuple(content['velocity']),
+ ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+ else tuple(content['ego_translation']),
+ num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+ detection_name=content['detection_name'],
+ detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+ attribute_name=content['attribute_name'],
+ visibility=content['visibility'],
+ index=content['index'],
+ )
+
+
+def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if a box's center is visible inside an image without accounting for occlusions.
+    :param box: The box to be checked.
+    :param intrinsic: Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of BoxVisibility.
+ :return True if visibility condition is satisfied.
+ """
+
+ center_3d = box.center.reshape(3, 1)
+ center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, center_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, center_img[1, :] > 0)
+ visible = np.logical_and(visible, center_3d[2, :] > 1)
+
+ in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
+
+ if vis_level == BoxVisibility.ALL:
+ return all(visible) and all(in_front)
+ elif vis_level == BoxVisibility.ANY:
+ return any(visible) and all(in_front)
+ elif vis_level == BoxVisibility.NONE:
+ return True
+ else:
+ raise ValueError("vis_level: {} not valid".format(vis_level))
+
+
+def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
+ vis_level: int = BoxVisibility.ANY) -> bool:
+ """
+    Check if a box is visible in an image, but not all of its corners are inside the image.
+    :param box: The box to be checked.
+    :param intrinsic: Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of BoxVisibility.
+ :return True if visibility condition is satisfied.
+ """
+
+ corners_3d = box.corners()
+ corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
+
+ visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
+ visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
+ visible = np.logical_and(visible, corners_img[1, :] > 0)
+ visible = np.logical_and(visible, corners_3d[2, :] > 1)
+
+ in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
+
+ if any(visible) and not all(visible) and all(in_front):
+ return True
+ else:
+ return False
+
+def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \
+ -> Tuple[EvalBoxes, Dict]:
+ """
+ Loads object predictions from file.
+ :param result_path: Path to the .json result file provided by the user.
+    :param max_boxes_per_sample: Maximum number of boxes allowed per sample.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+ :param verbose: Whether to print messages to stdout.
+ :return: The deserialized results and meta data.
+ """
+
+ # Load from file and check that the format is correct.
+ # with open(result_path) as f:
+ # data = json.load(f)
+ data = load(result_path)
+ assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \
+ 'See https://www.nuscenes.org/object-detection for more information.'
+
+ # Deserialize results and get meta data.
+ all_results = EvalBoxes.deserialize(data['results'], box_cls)
+ meta = data['meta']
+ if verbose:
+ print("Loaded results from {}. Found detections for {} samples."
+ .format(result_path, len(all_results.sample_tokens)))
+
+ # Check that each sample has no more than x predicted boxes.
+ for sample_token in all_results.sample_tokens:
+ assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \
+ "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample
+
+ return all_results, meta
+
+def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False):
+ """
+ Loads ground truth boxes from DB.
+ :param nusc: A NuScenes instance.
+ :param eval_split: The evaluation split for which we load GT boxes.
+ :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+ :param verbose: Whether to print messages to stdout.
+ :return: The GT boxes.
+ """
+
+ # Init.
+ if box_cls == DetectionBox_modified:
+ attribute_map = {a['token']: a['name'] for a in nusc.attribute}
+
+ if verbose:
+ print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
+ # Read out all sample_tokens in DB.
+ sample_tokens_all = [s['token'] for s in nusc.sample]
+ assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
+
+ # Only keep samples from this split.
+ splits = create_splits_scenes()
+
+ # Check compatibility of split with nusc_version.
+ version = nusc.version
+ if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
+ assert version.endswith('trainval'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split in {'mini_train', 'mini_val'}:
+ assert version.endswith('mini'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ elif eval_split == 'test':
+ assert version.endswith('test'), \
+ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+ else:
+ raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
+ .format(eval_split))
+
+ if eval_split == 'test':
+ # Check that you aren't trying to cheat :).
+ assert len(nusc.sample_annotation) > 0, \
+ 'Error: You are trying to evaluate on the test set but you do not have the annotations!'
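+    # Map each sample token to its 1-based frame index within its scene.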
+ index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ index_map[sample['token']] = index
+ index += 1
+
+ sample_tokens = []
+ for sample_token in sample_tokens_all:
+ scene_token = nusc.get('sample', sample_token)['scene_token']
+ scene_record = nusc.get('scene', scene_token)
+ if scene_record['name'] in splits[eval_split]:
+ sample_tokens.append(sample_token)
+
+ all_annotations = EvalBoxes()
+
+ # Load annotations and filter predictions and annotations.
+ tracking_id_set = set()
+ for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
+
+ sample = nusc.get('sample', sample_token)
+ sample_annotation_tokens = sample['anns']
+
+ sample_boxes = []
+ for sample_annotation_token in sample_annotation_tokens:
+
+ sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
+ if box_cls == DetectionBox_modified:
+ # Get label name in detection task and filter unused labels.
+ detection_name = category_to_detection_name(sample_annotation['category_name'])
+ if detection_name is None:
+ continue
+
+ # Get attribute_name.
+ attr_tokens = sample_annotation['attribute_tokens']
+ attr_count = len(attr_tokens)
+ if attr_count == 0:
+ attribute_name = ''
+ elif attr_count == 1:
+ attribute_name = attribute_map[attr_tokens[0]]
+ else:
+ raise Exception('Error: GT annotations must not have more than one attribute!')
+
+ sample_boxes.append(
+ box_cls(
+ token=sample_annotation_token,
+ sample_token=sample_token,
+ translation=sample_annotation['translation'],
+ size=sample_annotation['size'],
+ rotation=sample_annotation['rotation'],
+ velocity=nusc.box_velocity(sample_annotation['token'])[:2],
+ num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
+ detection_name=detection_name,
+ detection_score=-1.0, # GT samples do not have a score.
+ attribute_name=attribute_name,
+ visibility=sample_annotation['visibility_token'],
+ index=index_map[sample_token]
+ )
+ )
+ elif box_cls == TrackingBox:
+ assert False
+ else:
+ raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
+
+ all_annotations.add_boxes(sample_token, sample_boxes)
+
+ if verbose:
+ print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
+
+ return all_annotations
+
+
+def filter_eval_boxes_by_id(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ id=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Keeps only the boxes whose annotation token is in ``id``.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param id: The set of annotation tokens used to keep boxes.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.token in id:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_eval_boxes_by_visibility(
+ ori_eval_boxes: EvalBoxes,
+ visibility=None,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Keeps only the boxes whose visibility token equals ``visibility``.
+    :param ori_eval_boxes: An instance of the EvalBoxes class.
+    :param visibility: The visibility token used to keep boxes.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.visibility == visibility:
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After visibility based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+
+def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False):
+ eval_boxes = copy.deepcopy(ori_eval_boxes)
+ for sample_token in eval_boxes.sample_tokens:
+ if sample_token not in valid_sample_tokens:
+ eval_boxes.boxes.pop(sample_token)
+ return eval_boxes
+
+
+def filter_eval_boxes_by_overlap(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ verbose: bool = False) -> EvalBoxes:
+ """
+    Applies filtering to boxes based on camera overlap.
+ :param nusc: An instance of the NuScenes class.
+ :param eval_boxes: An instance of the EvalBoxes class.
+ :param verbose: Whether to print to stdout.
+ """
+
+ # Accumulators for number of filtered boxes.
+ cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+ total, anns_filter = 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on anns
+ total += len(eval_boxes[sample_token])
+ sample_record = nusc.get('sample', sample_token)
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ count = 0
+ for cam in cams:
+ '''
+                copy-pasted from the nuScenes devkit
+ '''
+ sample_data_token = sample_record['data'][cam]
+ sd_record = nusc.get('sample_data', sample_data_token)
+ cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+ sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+ pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+ cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+ imsize = (sd_record['width'], sd_record['height'])
+ new_box = Box(box.translation, box.size, Quaternion(box.rotation),
+ name=box.detection_name, token='')
+
+ # Move box to ego vehicle coord system.
+ new_box.translate(-np.array(pose_record['translation']))
+ new_box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+ # Move box to sensor coord system.
+ new_box.translate(-np.array(cs_record['translation']))
+ new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+ if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ count += 1
+ # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+ # count += 1
+
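+            # keep only boxes whose center projects into more than one camera (the overlap regions)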
+ if count > 1:
+ with open('center_overlap.txt', 'a') as f:
+ try:
+ f.write(box.token + '\n')
+ except:
+ pass
+ filtered_boxes.append(box)
+ anns_filter += len(filtered_boxes)
+ eval_boxes.boxes[sample_token] = filtered_boxes
+
+ verbose = True
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After anns based filtering: %d" % anns_filter)
+
+ return eval_boxes
+
+def _get_box_class_field(eval_boxes: EvalBoxes) -> str:
+ """
+ Retrieve the name of the class field in the boxes.
+ This parses through all boxes until it finds a valid box.
+ If there are no valid boxes, this function throws an exception.
+ :param eval_boxes: The EvalBoxes used for evaluation.
+ :return: The name of the class field in the boxes, e.g. detection_name or tracking_name.
+ """
+ assert len(eval_boxes.boxes) > 0
+ box = None
+ for val in eval_boxes.boxes.values():
+ if len(val) > 0:
+ box = val[0]
+ break
+ if isinstance(box, DetectionBox):
+ class_field = 'detection_name'
+ elif isinstance(box, TrackingBox):
+ class_field = 'tracking_name'
+ else:
+ raise Exception('Error: Invalid box type: %s' % box)
+
+ return class_field
+
+def filter_eval_boxes(nusc: NuScenes,
+ eval_boxes: EvalBoxes,
+ max_dist_x: Dict[str, float],
+ max_dist_y: Dict[str, float],
+ verbose: bool = False) -> EvalBoxes:
+ """
+ Applies filtering to boxes. Distance, bike-racks and points per box.
+ :param nusc: An instance of the NuScenes class.
+ :param eval_boxes: An instance of the EvalBoxes class.
+    :param max_dist_x: Maps each detection name to the eval distance threshold along the ego x axis.
+    :param max_dist_y: Maps each detection name to the eval distance threshold along the ego y axis.
+    :param verbose: Whether to print to stdout.
+    """
+    # Retrieve box type for detection/tracking boxes.
+ class_field = _get_box_class_field(eval_boxes)
+
+ # Accumulators for number of filtered boxes.
+ total, dist_filter, point_filter, bike_rack_filter = 0, 0, 0, 0
+ for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+ # Filter on distance first.
+ total += len(eval_boxes[sample_token])
+ eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if
+ abs(box.ego_translation[0]) < max_dist_x[box.__getattribute__(class_field)] \
+ and abs(box.ego_translation[1]) < max_dist_y[box.__getattribute__(class_field)]]
+ dist_filter += len(eval_boxes[sample_token])
+
+ # Then remove boxes with zero points in them. Eval boxes have -1 points by default.
+ eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if not box.num_pts == 0]
+ point_filter += len(eval_boxes[sample_token])
+
+ # Perform bike-rack filtering.
+ sample_anns = nusc.get('sample', sample_token)['anns']
+ bikerack_recs = [nusc.get('sample_annotation', ann) for ann in sample_anns if
+ nusc.get('sample_annotation', ann)['category_name'] == 'static_object.bicycle_rack']
+ bikerack_boxes = [Box(rec['translation'], rec['size'], Quaternion(rec['rotation'])) for rec in bikerack_recs]
+ filtered_boxes = []
+ for box in eval_boxes[sample_token]:
+ if box.__getattribute__(class_field) in ['bicycle', 'motorcycle']:
+ in_a_bikerack = False
+ for bikerack_box in bikerack_boxes:
+ if np.sum(points_in_box(bikerack_box, np.expand_dims(np.array(box.translation), axis=1))) > 0:
+ in_a_bikerack = True
+ if not in_a_bikerack:
+ filtered_boxes.append(box)
+ else:
+ filtered_boxes.append(box)
+
+ eval_boxes.boxes[sample_token] = filtered_boxes
+ bike_rack_filter += len(eval_boxes.boxes[sample_token])
+
+ if verbose:
+ print("=> Original number of boxes: %d" % total)
+ print("=> After distance based filtering: %d" % dist_filter)
+ print("=> After LIDAR and RADAR points based filtering: %d" % point_filter)
+ print("=> After bike rack filtering: %d" % bike_rack_filter)
+
+ return eval_boxes
+
+class NuScenesEval_custom(NuScenesEval):
+ """
+    Custom nuScenes detection evaluation: supports per-axis distance filtering and optional
+    visibility- and overlap-based filtering of ground-truth and predicted boxes.
+ """
+
+ def __init__(self,
+ nusc: NuScenes,
+ config: DetectionConfig,
+ result_path: str,
+ eval_set: str,
+ output_dir: str = None,
+ verbose: bool = True,
+ overlap_test=False,
+ eval_mask=False,
+ data_infos=None
+ ):
+ """
+ Initialize a DetectionEval object.
+ :param nusc: A NuScenes object.
+ :param config: A DetectionConfig object.
+ :param result_path: Path of the nuScenes JSON result file.
+ :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+ :param output_dir: Folder to save plots and results to.
+ :param verbose: Whether to print to stdout.
+ """
+
+ self.nusc = nusc
+ self.result_path = result_path
+ self.eval_set = eval_set
+ self.output_dir = output_dir
+ self.verbose = verbose
+ self.cfg = config
+ self.overlap_test = overlap_test
+ self.eval_mask = eval_mask
+ self.data_infos = data_infos
+ # Check result file exists.
+ assert os.path.exists(result_path), 'Error: The result file does not exist!'
+
+ # Make dirs.
+ self.plot_dir = os.path.join(self.output_dir, 'plots')
+ if not os.path.isdir(self.output_dir):
+ os.makedirs(self.output_dir)
+ if not os.path.isdir(self.plot_dir):
+ os.makedirs(self.plot_dir)
+
+ # Load data.
+ if verbose:
+ print('Initializing nuScenes detection evaluation')
+ self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
+ verbose=verbose)
+ self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
+
+ # assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+ # "Samples in split doesn't match samples in predictions."
+
+ # Add center distances.
+ self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+ self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+ # Filter boxes (distance, points per box, etc.).
+
+ if verbose:
+ print('Filtering predictions')
+ self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose)
+ if verbose:
+ print('Filtering ground truth annotations')
+ self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose)
+
+ if self.overlap_test:
+ self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
+
+ self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)
+
+ self.all_gt = copy.deepcopy(self.gt_boxes)
+ self.all_preds = copy.deepcopy(self.pred_boxes)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
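+        # Map each sample token to its 1-based frame index within its scene (used by update_gt(type_='ord')).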
+ self.index_map = {}
+ for scene in nusc.scene:
+ first_sample_token = scene['first_sample_token']
+ sample = nusc.get('sample', first_sample_token)
+ self.index_map[first_sample_token] = 1
+ index = 2
+ while sample['next'] != '':
+ sample = nusc.get('sample', sample['next'])
+ self.index_map[sample['token']] = index
+ index += 1
+
+ def update_gt(self, type_='vis', visibility='1', index=1):
+ if type_ == 'vis':
+ self.visibility_test = True
+ if self.visibility_test:
+ '''[{'description': 'visibility of whole object is between 0 and 40%',
+ 'token': '1',
+ 'level': 'v0-40'},
+ {'description': 'visibility of whole object is between 40 and 60%',
+ 'token': '2',
+ 'level': 'v40-60'},
+ {'description': 'visibility of whole object is between 60 and 80%',
+ 'token': '3',
+ 'level': 'v60-80'},
+ {'description': 'visibility of whole object is between 80 and 100%',
+ 'token': '4',
+ 'level': 'v80-100'}]'''
+
+ self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True)
+
+ elif type_ == 'ord':
+
+ valid_tokens = [key for (key, value) in self.index_map.items() if value == index]
+ # from IPython import embed
+ # embed()
+ self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
+ self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens)
+ self.sample_tokens = self.gt_boxes.sample_tokens
+
+
+ def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
+ """
+ Performs the actual evaluation.
+ :return: A tuple of high-level and the raw metric data.
+ """
+ start_time = time.time()
+
+ # -----------------------------------
+ # Step 1: Accumulate metric data for all classes and distance thresholds.
+ # -----------------------------------
+ if self.verbose:
+ print('Accumulating metric data...')
+ metric_data_list = DetectionMetricDataList()
+
+ # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths)
+ # self.cfg.dist_ths = [0.3]
+ # self.cfg.dist_fcn_callable
+ for class_name in self.cfg.class_names:
+ for dist_th in self.cfg.dist_ths:
+ md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+ metric_data_list.set(class_name, dist_th, md)
+
+ # -----------------------------------
+ # Step 2: Calculate metrics from the data.
+ # -----------------------------------
+ if self.verbose:
+ print('Calculating metrics...')
+ metrics = DetectionMetrics(self.cfg)
+ for class_name in self.cfg.class_names:
+ # Compute APs.
+ for dist_th in self.cfg.dist_ths:
+ metric_data = metric_data_list[(class_name, dist_th)]
+ ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision)
+ metrics.add_label_ap(class_name, dist_th, ap)
+ # Compute TP metrics.
+ for metric_name in TP_METRICS:
+ metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)]
+ if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']:
+ tp = np.nan
+ elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+ tp = np.nan
+ else:
+ tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+ metrics.add_label_tp(class_name, metric_name, tp)
+
+ # Compute evaluation time.
+ metrics.add_runtime(time.time() - start_time)
+
+ return metrics, metric_data_list
+
+ def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None:
+ """
+ Renders various PR and TP curves.
+ :param metrics: DetectionMetrics instance.
+ :param md_list: DetectionMetricDataList instance.
+ """
+ if self.verbose:
+ print('Rendering PR and TP curves')
+
+ def savepath(name):
+ return os.path.join(self.plot_dir, name + '.pdf')
+
+ summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall,
+ dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary'))
+
+ for detection_name in self.cfg.class_names:
+ class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath(detection_name + '_pr'))
+
+ class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp,
+ savepath=savepath(detection_name + '_tp'))
+
+ for dist_th in self.cfg.dist_ths:
+ dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall,
+ savepath=savepath('dist_pr_' + str(dist_th)))
+
+
+if __name__ == "__main__":
+
+ # Settings.
+ parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('result_path', type=str, help='The submission as a JSON file.')
+ parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics',
+ help='Folder to store result metrics, graphs and example visualizations.')
+ parser.add_argument('--eval_set', type=str, default='val',
+ help='Which dataset split to evaluate on, train, val or test.')
+ parser.add_argument('--dataroot', type=str, default='data/nuscenes',
+ help='Default nuScenes data directory.')
+ parser.add_argument('--version', type=str, default='v1.0-trainval',
+ help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
+ parser.add_argument('--config_path', type=str, default='',
+ help='Path to the configuration file.'
+ 'If no path given, the CVPR 2019 configuration will be used.')
+ parser.add_argument('--plot_examples', type=int, default=0,
+ help='How many example visualizations to write to disk.')
+ parser.add_argument('--render_curves', type=int, default=1,
+ help='Whether to render PR and TP curves to disk.')
+ parser.add_argument('--verbose', type=int, default=1,
+ help='Whether to print to stdout.')
+ args = parser.parse_args()
+
+ result_path_ = os.path.expanduser(args.result_path)
+ output_dir_ = os.path.expanduser(args.output_dir)
+ eval_set_ = args.eval_set
+ dataroot_ = args.dataroot
+ version_ = args.version
+ config_path = args.config_path
+ plot_examples_ = args.plot_examples
+ render_curves_ = bool(args.render_curves)
+ verbose_ = bool(args.verbose)
+
+ if config_path == '':
+ cfg_ = config_factory('detection_cvpr_2019')
+ else:
+ with open(config_path, 'r') as _f:
+ cfg_ = DetectionConfig.deserialize(json.load(_f))
+
+ nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
+ nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
+ output_dir=output_dir_, verbose=verbose_)
+ for vis in ['1', '2', '3', '4']:
+ nusc_eval.update_gt(type_='vis', visibility=vis)
+ print(f'================ {vis} ===============')
+ nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_)
+ #for index in range(1, 41):
+ # nusc_eval.update_gt(type_='ord', index=index)
+ #
diff --git a/mmcv/datasets/vis_utils.py b/mmcv/datasets/vis_utils.py
new file mode 100644
index 0000000..281703f
--- /dev/null
+++ b/mmcv/datasets/vis_utils.py
@@ -0,0 +1,670 @@
+import numpy as np
+import cv2
+from matplotlib import cm
+import math
+import open3d as o3d
+import os
+
+WINDOW_HEIGHT = 900
+WINDOW_WIDTH = 1600
+
+DIS_CAR_SAVE = 50
+DIS_WALKER_SAVE = 50
+DIS_SIGN_SAVE = 50
+DIS_LIGHT_SAVE = 50
+
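+# vertex-index pairs forming the 12 edges of a 3D bounding box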
+edges = [[0,1], [1,3], [3,2], [2,0], [0,4], [4,5], [5,1], [5,7], [7,6], [6,4], [6,2], [7,3]]
+
+carla_bbox_edges = [
+ (0, 1), (1, 2), (2, 3), (3, 0), # Bottom face
+ (4, 5), (5, 6), (6, 7), (7, 4), # Top face
+ (0, 4), (1, 5), (2, 6), (3, 7) # Side edges connecting top and bottom faces
+]
+
+VIRIDIS = np.array(cm.get_cmap('plasma').colors)
+VID_RANGE = np.linspace(0.0, 1.0, VIRIDIS.shape[0])
+LABEL_COLORS = np.array([
+ (255, 255, 255), # None
+ (70, 70, 70), # Building
+ (100, 40, 40), # Fences
+ (55, 90, 80), # Other
+ (220, 20, 60), # Pedestrian
+ (153, 153, 153), # Pole
+ (157, 234, 50), # RoadLines
+ (128, 64, 128), # Road
+ (244, 35, 232), # Sidewalk
+ (107, 142, 35), # Vegetation
+ (0, 0, 142), # Vehicle
+ (102, 102, 156), # Wall
+ (220, 220, 0), # TrafficSign
+ (70, 130, 180), # Sky
+ (81, 0, 81), # Ground
+ (150, 100, 100), # Bridge
+ (230, 150, 140), # RailTrack
+ (180, 165, 180), # GuardRail
+ (250, 170, 30), # TrafficLight
+ (110, 190, 160), # Static
+ (170, 120, 50), # Dynamic
+ (45, 60, 150), # Water
+ (145, 170, 100), # Terrain
+]) / 255.0 # normalize each channel to [0, 1] since that is what Open3D uses
+
+SEM_SEG_LABEL_COLORS = {
+ 0 : ( 0, 0, 0), # unlabeled
+ # cityscape
+ 1 : (128, 64, 128), # road
+ 2 : (244, 35, 232), # sidewalk
+ 3 : ( 70, 70, 70), # building
+ 4 : (102, 102, 156), # wall
+ 5 : (190, 153, 153), # fence
+ 6 : (153, 153, 153), # pole
+ 7 : (250, 170, 30), # traffic light
+ 8 : (220, 220, 0), # traffic sign
+ 9 : (107, 142, 35), # vegetation
+ 10 : (152, 251, 152), # terrain
+ 11 : ( 70, 130, 180), # sky
+ 12 : (220, 20, 60), # pedestrian
+ 13 : (255, 0, 0), # rider
+ 14 : ( 0, 0, 142), # Car
+ 15 : ( 0, 0, 70), # truck
+ 16 : ( 0, 60, 100), # bus
+ 17 : ( 0, 80, 100), # train
+ 18 : ( 0, 0, 230), # motorcycle
+ 19 : (119, 11, 32), # bicycle
+ # custom
+ 20 : (110, 190, 160), # static
+ 21 : (170, 120, 50), # dynamic
+ 22 : ( 55, 90, 80), # other
+ 23 : ( 45, 60, 150), # water
+ 24 : (157, 234, 50), # road line
+ 25 : ( 81, 0, 81), # ground
+ 26 : (150, 100, 100), # bridge
+ 27 : (230, 150, 140), # rail track
+ 28 : (180, 165, 180), # guard rail
+}
+
+uniad_class_names = [
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+ 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+
+carla_class_name = [
+ 'car', 'truck', 'bus', 'van', 'motorcycle', 'bicycle', 'pedestrian',
+]
+
+TYPE_ID_MAP = {
+ #=================vehicle=================
+ # bicycle
+ 'vehicle.bh.crossbike': 'bicycle',
+ "vehicle.diamondback.century": 'bicycle',
+ # car
+ "vehicle.chevrolet.impala": 'car',
+ "vehicle.dodge.charger_2020": 'car',
+ "vehicle.dodge.charger_police_2020": 'car',
+ "vehicle.lincoln.mkz_2017": 'car',
+ "vehicle.lincoln.mkz_2020": 'car',
+ "vehicle.mini.cooper_s_2021": 'car',
+ "vehicle.mercedes.coupe_2020": 'car',
+ "vehicle.ford.mustang": 'car',
+ "vehicle.nissan.patrol_2021": 'car',
+ "vehicle.audi.tt": 'car',
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car',
+ # bus
+ # van
+ "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van",
+ #=========================================
+
+ #=================traffic sign============
+ # traffic.speed_limit
+ "traffic.speed_limit.30": 'speed_limit',
+ "traffic.speed_limit.40": 'speed_limit',
+ "traffic.speed_limit.50": 'speed_limit',
+ # traffic.traffic_light
+ "traffic.traffic_light": 'traffic_light',
+ # traffic.stop
+ "traffic.stop": 'stop',
+ #=========================================
+}
+
+def calc_projected_2d_bbox(vertices_pos2d):
+ """ Takes in all vertices in pixel projection and calculates min and max of all x and y coordinates.
+ Returns left top, right bottom pixel coordinates for the 2d bounding box as a list of four values.
+    Note that vertices_pos2d is an (N, 2) array of (x, y) pixel coordinates.
+ """
+ x_coords = vertices_pos2d[:, 0]
+ y_coords = vertices_pos2d[:, 1]
+ min_x, max_x = np.min(x_coords), np.max(x_coords)
+ min_y, max_y = np.min(y_coords), np.max(y_coords)
+ return [min_x, min_y, max_x, max_y]
+
+def calculate_occlusion(bbox, point_depth, agent, depth_map):
+ """Calculate the occlusion value of a 2D bounding box.
+    Iterate over each pixel in the bounding box and declare it occluded only if the
+    4 surrounding pixels are closer to the camera (according to the depth map) than
+    the distance to the middle of the 3D bounding box minus a margin (the extent of the object).
+ """
+ bbox_3d_mid = np.mean(point_depth)
+ min_x, min_y, max_x, max_y = calc_projected_2d_bbox(bbox)
+ height, width, length = agent.bounding_box.extent.z, agent.bounding_box.extent.x, agent.bounding_box.extent.y
+
+    # depth_margin should depend on the rotation of the object, but this approximation works well enough
+ depth_margin = np.max([2 * width, 2 * length])
+ is_occluded = []
+
+ for x in range(int(min_x), int(max_x)):
+ for y in range(int(min_y), int(max_y)):
+ is_occluded.append(point_is_occluded(
+ (y, x), bbox_3d_mid - depth_margin, depth_map))
+
+ occlusion = ((float(np.sum(is_occluded))) / ((max_x-min_x) * (max_y-min_y)))
+ #discretize the 0–1 occlusion value into KITTI’s {0,1,2,3} labels by equally dividing the interval into 4 parts
+ # occlusion = np.digitize(occlusion, bins=[0.25, 0.50, 0.75])
+ return occlusion
+
+def calculate_occlusion_vectorized(bbox, point_depth, extent, depth_map):
+ """Calculate the occlusion value of a 2D bounding box.
+    Iterate over each pixel in the bounding box and declare it occluded only if the
+    4 surrounding pixels are closer to the camera (according to the depth map) than
+    the distance to the middle of the 3D bounding box minus a margin (the extent of the object).
+ """
+ bbox_3d_mid = np.mean(point_depth)
+ min_x, min_y, max_x, max_y = calc_projected_2d_bbox(bbox)
+ height, width, length = extent[2], extent[0], extent[1]
+ depth_margin = np.max([2 * width, 2 * length])
+ count_num = (max_x - min_x) * (max_y - min_y)
+    # subsample large boxes so that roughly 100 pixels are tested
+    if count_num > 100:
+        p = 100 / count_num
+    else:
+        p = 1
+    sample_step_approx = int(np.sqrt(1/p))
+
+ # x, y = np.meshgrid(np.arange(min_x, max_x), np.arange(min_y, max_y))
+ x, y = np.meshgrid(np.arange(min_x, max_x, sample_step_approx), np.arange(min_y, max_y, sample_step_approx))
+ points = np.stack((y.flatten(), x.flatten()), axis=1)
+ is_occluded_array = point_is_occluded_single(points, bbox_3d_mid - depth_margin, depth_map)
+ occlusion = is_occluded_array.mean()
+ #discretize the 0–1 occlusion value into KITTI’s {0,1,2,3} labels by equally dividing the interval into 4 parts
+ # occlusion = np.digitize(occlusion, bins=[0.25, 0.50, 0.75])
+ return occlusion
+
+def calc_bbox2d_area(bbox_2d):
+ """ Calculate the area of the given 2d bbox
+ Input is assumed to be xmin, ymin, xmax, ymax tuple
+ """
+ xmin, ymin, xmax, ymax = bbox_2d
+ return (ymax - ymin) * (xmax - xmin)
+
+def calculate_truncation(uncropped_bbox, cropped_bbox):
+ "Calculate how much of the object's 2D uncropped bounding box is outside the image boundary"
+
+ area_cropped = calc_bbox2d_area(cropped_bbox)
+ area_uncropped = calc_bbox2d_area(uncropped_bbox)
+ truncation = 1.0 - float(area_cropped / area_uncropped)
+ return truncation
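+
+# Worked example (illustrative): if half of a 100 x 100 px box is cropped away at
+# the image border, the truncation is 1 - 5000 / 10000 = 0.5:
+#   calculate_truncation([0, 0, 100, 100], [0, 0, 100, 50])  # -> 0.5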
+
+def crop_boxes_in_canvas(cam_bboxes):
+ neg_x_inds = np.where(cam_bboxes[:, 0] < 0)[0]
+ out_x_inds = np.where(cam_bboxes[:, 0] > WINDOW_WIDTH)[0]
+ neg_y_inds = np.where(cam_bboxes[:, 1] < 0)[0]
+ out_y_inds = np.where(cam_bboxes[:, 1] > WINDOW_HEIGHT)[0]
+    # clamp x coordinates to [0, WINDOW_WIDTH] and y coordinates to [0, WINDOW_HEIGHT]
+    cam_bboxes[neg_x_inds, 0] = 0
+    cam_bboxes[out_x_inds, 0] = WINDOW_WIDTH
+    cam_bboxes[neg_y_inds, 1] = 0
+    cam_bboxes[out_y_inds, 1] = WINDOW_HEIGHT
+ return cam_bboxes
+
+def point_is_occluded(point, vertex_depth, depth_map):
+ """ Checks whether or not the four pixels directly around the given point has less depth than the given vertex depth
+ If True, this means that the point is occluded.
+ """
+ y, x = map(int, point)
+ from itertools import product
+    neighbours = product((1, -1), repeat=2)
+    is_occluded = []
+    for dy, dx in neighbours:
+ if point_in_canvas_hw((dy+y, dx+x)):
+ # If the depth map says the pixel is closer to the camera than the actual vertex
+ if depth_map[y+dy, x+dx] < vertex_depth:
+ is_occluded.append(True)
+ else:
+ is_occluded.append(False)
+ # Only say point is occluded if all four neighbours are closer to camera than vertex
+ return all(is_occluded)
+
+def point_is_occluded_single(points, vertex_depth, depth_map, canvas_shape=(WINDOW_HEIGHT, WINDOW_WIDTH)):
+ '''
+ Simplified version that checks occlusion based only on the points' own depth
+ '''
+ points = np.asarray(points).astype(np.int32)
+ y, x = points[:, 0], points[:, 1]
+
+ valid = (y >= 0) & (y < canvas_shape[0]) & \
+ (x >= 0) & (x < canvas_shape[1])
+
+ is_occluded = np.zeros(len(points), dtype=bool)
+    try:
+        is_occluded[valid] = depth_map[y[valid], x[valid]] < vertex_depth
+    except IndexError:
+        # canvas_shape may not match the depth map; leave out-of-range points marked visible
+        pass
+ return is_occluded
+
+def point_is_occluded_vectorized(points, vertex_depth, depth_map, canvas_shape=(WINDOW_HEIGHT, WINDOW_WIDTH)):
+ '''
+ Equivalent to point_is_occluded
+ '''
+ points = np.asarray(points).astype(np.int32)
+ y, x = points[:, 0], points[:, 1]
+
+ dy, dx = np.array([1, 1, -1, -1]), np.array([1, -1, 1, -1])
+ neighbour_y = y[:, np.newaxis] + dy
+ neighbour_x = x[:, np.newaxis] + dx
+
+ valid = (neighbour_y >= 0) & (neighbour_y < canvas_shape[0]) & \
+ (neighbour_x >= 0) & (neighbour_x < canvas_shape[1])
+
+ neighbour_depths = np.full(neighbour_y.shape, np.inf)
+ for i in range(4):
+ mask = valid[:, i]
+ neighbour_depths[mask, i] = depth_map[neighbour_y[mask, i], neighbour_x[mask, i]]
+
+ is_occluded = np.logical_and.reduce(neighbour_depths < vertex_depth, axis=1) & np.any(valid, axis=1)
+ return is_occluded
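+
+# Small sanity check (illustrative): on a constant 1 m depth map, a vertex lying
+# behind that surface is reported occluded, while one in front of it is not.
+#   depth = np.ones((10, 10))
+#   point_is_occluded_vectorized(np.array([[5, 5]]), 2.0, depth, (10, 10))  # -> array([ True])
+#   point_is_occluded_vectorized(np.array([[5, 5]]), 0.5, depth, (10, 10))  # -> array([False])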
+
+def draw_3d_bbox_vertex(image, points):
+ for x_2d, y_2d, vertex_color in points:
+ cv2.circle(image, (int(x_2d), int(y_2d)), radius=3, color=vertex_color, thickness=1)
+
+def calculate_occlusion_stats(bbox_points, depth, depth_map, max_render_depth):
+ """ Draws each vertex in vertices_pos2d if it is in front of the camera
+ The color is based on whether the object is occluded or not.
+ Returns the number of visible vertices and the number of vertices outside the camera.
+ """
+ num_visible_vertices = 0
+ num_invisible_vertices = 0
+ num_vertices_outside_camera = 0
+ points = []
+
+ for i in range(len(bbox_points)):
+ x_2d = bbox_points[i][0]
+ y_2d = bbox_points[i][1]
+ point_depth = depth[i]
+
+ # if the point is in front of the camera but not too far away
+ if max_render_depth > point_depth > 0 and point_in_canvas_hw((y_2d, x_2d)):
+ #is_occluded_v = point_is_occluded_vectorized([[y_2d, x_2d]], point_depth, depth_map)
+ is_occluded = point_is_occluded(
+ (y_2d, x_2d), point_depth, depth_map)
+
+ if is_occluded:
+ vertex_color = (0,0,255) # bgr, red
+ num_invisible_vertices += 1
+ else:
+ num_visible_vertices += 1
+ vertex_color = (0,255,0) # bgr, green
+ points.append((x_2d, y_2d, vertex_color))
+ else:
+ num_vertices_outside_camera += 1
+ return num_visible_vertices, num_invisible_vertices, num_vertices_outside_camera, points
+
+def get_intrinsic_matrix(camera):
+
+ width = int(camera.attributes['image_size_x'])
+ height = int(camera.attributes['image_size_y'])
+ fov = float(camera.attributes['fov'])
+
+ k = np.identity(3)
+ k[0, 2] = width / 2.0
+ k[1, 2] = height / 2.0
+ k[0, 0] = k[1, 1] = width / (2.0 * np.tan(fov * np.pi / 360.0))
+
+ return k
+
+def get_image_point(loc, K, w2c):
+ # Calculate 2D projection of 3D coordinate
+
+    # Format the input coordinate (loc is an (x, y, z) sequence in world coordinates)
+ point = np.array([loc[0], loc[1], loc[2], 1])
+ # transform to camera coordinates
+ point_camera = np.dot(w2c, point)
+
+    # Now we must change from UE4's coordinate system to a "standard" camera one
+    # (x, y, z) -> (y, -z, x)
+    # and we also drop the fourth (homogeneous) component
+ point_camera = [point_camera[1], -point_camera[2], point_camera[0]]
+
+ depth = point_camera[2]
+
+ # now project 3D->2D using the camera matrix
+ point_img = np.dot(K, point_camera)
+ # normalize
+ point_img[0] /= point_img[2]
+ point_img[1] /= point_img[2]
+
+ return point_img[0:2], depth
+
+def point_in_canvas_hw(pos):
+ """Return true if point is in canvas"""
+ if (pos[0] >= 0) and (pos[0] < WINDOW_HEIGHT) and (pos[1] >= 0) and (pos[1] < WINDOW_WIDTH):
+ return True
+ return False
+
+def point_in_canvas_wh(pos):
+ """Return true if point is in canvas"""
+ if (pos[0] >= 0) and (pos[0] < WINDOW_WIDTH) and (pos[1] >= 0) and (pos[1] < WINDOW_HEIGHT):
+ return True
+ return False
+
+def build_projection_matrix(w, h, fov, is_behind_camera=False):
+ focal = w / (2.0 * np.tan(fov * np.pi / 360.0))
+ K = np.identity(3)
+
+ if is_behind_camera:
+ K[0, 0] = K[1, 1] = -focal
+ else:
+ K[0, 0] = K[1, 1] = focal
+
+ K[0, 2] = w / 2.0
+ K[1, 2] = h / 2.0
+ return K
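+
+# A minimal projection sketch (illustrative; `_projection_sanity_check` is not part
+# of the original tooling and the 1600x900 / 70-degree camera is an assumption).
+# With an identity world-to-camera matrix, a point 10 m straight ahead of the
+# camera should land exactly on the image centre.
+def _projection_sanity_check(width=1600, height=900, fov=70.0):
+    K = build_projection_matrix(width, height, fov)
+    w2c = np.identity(4)
+    (u, v), depth = get_image_point([10.0, 0.0, 0.0], K, w2c)
+    # expected: u == width / 2, v == height / 2, depth == 10.0
+    return (u, v), depth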
+
+def rotate_3d(vector, theta):
+ theta = np.radians(theta)
+ R = np.array([[np.cos(theta), -np.sin(theta), 0],
+ [np.sin(theta), np.cos(theta), 0],
+ [0, 0, 1]])
+
+ v_rotated = np.dot(R, vector)
+ return np.array([v_rotated[0], v_rotated[1], v_rotated[2]])
+
+def normalize_angle_degree(x):
+ x = x % 360.0
+ if x > 180.0:
+ x -= 360.0
+ return x
+
+
+def algin_lidar(lidar, translation, yaw):
+ """
+    Transforms a LiDAR point cloud into a new coordinate system by applying the
+    inverse of the given translation and yaw.
+    :param lidar: numpy LiDAR point cloud (N,3)
+    :param translation: translation in meters
+ :param yaw: yaw angle in radians
+ :return: numpy LiDAR point cloud in the new coordinate system.
+ """
+
+ rotation_matrix = np.array([[np.cos(yaw), -np.sin(yaw), 0.0], [np.sin(yaw), np.cos(yaw), 0.0], [0.0, 0.0, 1.0]])
+
+ aligned_lidar = (rotation_matrix.T @ (lidar - translation).T).T
+
+ return aligned_lidar
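+
+# Usage sketch (illustrative): express a world-frame cloud in the frame of a sensor
+# located at `translation` with heading `yaw` (radians).
+#   cloud_world = np.array([[11.0, 2.0, 0.5]])
+#   algin_lidar(cloud_world, np.array([10.0, 2.0, 0.0]), np.pi / 2)
+#   # -> approximately [[0.0, -1.0, 0.5]]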
+
+def convert_depth(data):
+ """
+    Decodes a CARLA-encoded depth map into metric depth in meters (up to the 1000 m far plane).
+ """
+ data = data.astype(np.float16)
+
+ normalized = np.dot(data, [65536.0, 256.0, 1.0])
+ normalized /= (256 * 256 * 256 - 1)
+ return normalized * 1000
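+
+# Worked example (illustrative), assuming the image array is laid out so that
+# channel 0 holds the most significant byte, as this helper expects: an all-255
+# pixel decodes to the 1000 m far plane and an all-zero pixel to 0 m.
+#   convert_depth(np.array([[[255, 255, 255]]], dtype=np.uint8))  # -> [[1000.]]
+#   convert_depth(np.array([[[0, 0, 0]]], dtype=np.uint8))        # -> [[0.]]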
+
+def get_relative_transform(ego_matrix, vehicle_matrix):
+ """
+ Returns the position of the vehicle matrix in the ego coordinate system.
+ :param ego_matrix: ndarray 4x4 Matrix of the ego vehicle in global
+ coordinates
+ :param vehicle_matrix: ndarray 4x4 Matrix of another actor in global
+ coordinates
+ :return: ndarray position of the other vehicle in the ego coordinate system
+ """
+ relative_pos = vehicle_matrix[:3, 3] - ego_matrix[:3, 3]
+ rot = ego_matrix[:3, :3].T
+ relative_pos = rot @ relative_pos
+
+ return relative_pos
+
+def normalize_angle(x):
+ x = x % (2 * np.pi) # force in range [0, 2 pi)
+ if x > np.pi: # move to [-pi, pi)
+ x -= 2 * np.pi
+ return x
+
+def build_skeleton(ped, sk_links):
+
+ ######## get the pedestrian skeleton #########
+ bones = ped.get_bones()
+
+ # list where we will store the lines we will project
+ # onto the camera output
+ lines_3d = []
+
+ # cycle through the bone pairs in skeleton.txt and retrieve the joint positions
+ for link in sk_links[1:]:
+
+ # get the roots of the two bones to be joined
+ bone_transform_1 = next(filter(lambda b: b.name == link[0], bones.bone_transforms), None)
+ bone_transform_2 = next(filter(lambda b: b.name == link[1], bones.bone_transforms), None)
+
+ # some bone names aren't matched
+ if bone_transform_1 is not None and bone_transform_2 is not None:
+ lines_3d.append([(bone_transform_1.world.location.x, bone_transform_1.world.location.y, bone_transform_1.world.location.z),
+ (bone_transform_2.world.location.x, bone_transform_2.world.location.y, bone_transform_2.world.location.z)]
+ )
+ return lines_3d
+
+def get_center_and_extent(verts):
+ sum_x = sum_y = sum_z = 0
+ max_x = max_y = max_z = float('-inf')
+ min_x = min_y = min_z = float('inf')
+
+ for pos in verts:
+ sum_x += pos.x
+ sum_y += pos.y
+ sum_z += pos.z
+
+ max_x = max(max_x, pos.x)
+ max_y = max(max_y, pos.y)
+ max_z = max(max_z, pos.z)
+
+ min_x = min(min_x, pos.x)
+ min_y = min(min_y, pos.y)
+ min_z = min(min_z, pos.z)
+
+    center = (sum_x / 8, sum_y / 8, sum_z / 8)  # a bounding box always has 8 vertices
+
+ extent = ((max_x - min_x)/2, (max_y - min_y)/2, (max_z - min_z)/2)
+ return center, extent
+
+def get_forward_vector(yaw):
+
+ yaw_rad = math.radians(yaw)
+
+ x = math.cos(yaw_rad)
+ y = math.sin(yaw_rad)
+
+ z = 0
+ return np.array([x, y, z])
+
+def calculate_cube_vertices(center, extent):
+ cx, cy, cz = center
+ x, y, z = extent
+ vertices = [
+ (cx + x, cy + y, cz + z),
+ (cx + x, cy + y, cz - z),
+ (cx + x, cy - y, cz + z),
+ (cx + x, cy - y, cz - z),
+ (cx - x, cy + y, cz + z),
+ (cx - x, cy + y, cz - z),
+ (cx - x, cy - y, cz + z),
+ (cx - x, cy - y, cz - z)
+ ]
+ return vertices
+
+
+def calculate_cube_vertices_2(center, extent):
+ cx, cy, cz = center.x, center.y, center.z
+ x, y, z = extent.x, extent.y, extent.z
+ vertices = [
+ (cx + x, cy + y, cz + z),
+ (cx + x, cy + y, cz - z),
+ (cx + x, cy - y, cz + z),
+ (cx + x, cy - y, cz - z),
+ (cx - x, cy + y, cz + z),
+ (cx - x, cy + y, cz - z),
+ (cx - x, cy - y, cz + z),
+ (cx - x, cy - y, cz - z)
+ ]
+ return vertices
+
+def calculate_cube_vertices_3(center, extent):
+ cx, cy, cz = center[0], center[1], center[2]
+ x, y, z = extent[0], extent[1], extent[2]
+ vertices = [
+ (cx + x, cy + y, cz + z),
+ (cx + x, cy + y, cz - z),
+ (cx + x, cy - y, cz + z),
+ (cx + x, cy - y, cz - z),
+ (cx - x, cy + y, cz + z),
+ (cx - x, cy + y, cz - z),
+ (cx - x, cy - y, cz + z),
+ (cx - x, cy - y, cz - z)
+ ]
+ return vertices
+
+
+
+
+def draw_dashed_line(img, start_point, end_point, color, thickness=1, dash_length=5):
+
+    d = np.sqrt((end_point[0] - start_point[0])**2 + (end_point[1] - start_point[1])**2)
+    if d == 0:
+        return  # degenerate segment, nothing to draw
+    dx = (end_point[0] - start_point[0]) / d
+    dy = (end_point[1] - start_point[1]) / d
+
+ x, y = start_point[0], start_point[1]
+
+ while d >= dash_length:
+
+ x_end = x + dx * dash_length
+ y_end = y + dy * dash_length
+ cv2.line(img, (int(x), int(y)), (int(x_end), int(y_end)), color, thickness)
+ x = x_end + dx * dash_length
+ y = y_end + dy * dash_length
+ d -= 2 * dash_length
+
+def get_matrix(location, rotation):
+ """
+    Creates a 4x4 transform matrix from a location (x, y, z) and a rotation (pitch, roll, yaw) in degrees.
+ """
+ pitch, roll, yaw = rotation
+ x, y, z = location
+ c_y = np.cos(np.radians(yaw))
+ s_y = np.sin(np.radians(yaw))
+ c_r = np.cos(np.radians(roll))
+ s_r = np.sin(np.radians(roll))
+ c_p = np.cos(np.radians(pitch))
+ s_p = np.sin(np.radians(pitch))
+ matrix = np.matrix(np.identity(4))
+ matrix[0, 3] = x
+ matrix[1, 3] = y
+ matrix[2, 3] = z
+ matrix[0, 0] = c_p * c_y
+ matrix[0, 1] = c_y * s_p * s_r - s_y * c_r
+ matrix[0, 2] = -c_y * s_p * c_r - s_y * s_r
+ matrix[1, 0] = s_y * c_p
+ matrix[1, 1] = s_y * s_p * s_r + c_y * c_r
+ matrix[1, 2] = -s_y * s_p * c_r + c_y * s_r
+ matrix[2, 0] = s_p
+ matrix[2, 1] = -c_p * s_r
+ matrix[2, 2] = c_p * c_r
+ return matrix
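+
+# Quick sanity check (illustrative), assuming CARLA-style angles in degrees: a pure
+# 90 degree yaw maps the x axis onto the y axis.
+#   M = get_matrix((0, 0, 0), (0, 0, 90))               # rotation given as (pitch, roll, yaw)
+#   np.asarray(M)[:3, :3] @ np.array([1.0, 0.0, 0.0])   # -> approximately [0, 1, 0]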
+
+def euler_to_rotation_matrix(pitch, roll, yaw):
+ Ry_pitch = np.array([
+ [np.cos(pitch), 0, np.sin(pitch)],
+ [0, 1, 0],
+ [-np.sin(pitch), 0, np.cos(pitch)]
+ ])
+ Rx_roll = np.array([
+ [1, 0, 0],
+ [0, np.cos(roll), -np.sin(roll)],
+ [0, np.sin(roll), np.cos(roll)]
+ ])
+ Rz_yaw = np.array([
+ [np.cos(yaw), -np.sin(yaw), 0],
+ [np.sin(yaw), np.cos(yaw), 0],
+ [0, 0, 1]
+ ])
+ return np.dot(Rz_yaw, np.dot(Rx_roll, Ry_pitch))
+
+def world_to_ego_no(point_world, ego_location, ego_rotation):
+ rotation_matrix = euler_to_rotation_matrix(np.radians(ego_rotation[0]),
+ np.radians(ego_rotation[1]),
+ np.radians(ego_rotation[2]))
+
+ point_relative = np.array(point_world) - np.array(ego_location)
+ point = np.dot(rotation_matrix, point_relative)
+    # flip the y axis: (x, y, z) -> (x, -y, z)
+ point = [point[0], -point[1], point[2]]
+ return point
+
+def world_to_ego(point_world, w2e):
+ point_world = np.array([point_world[0], point_world[1], point_world[2], 1])
+ point_ego = np.dot(w2e, point_world)
+ point_ego = [point_ego[1], -point_ego[0], point_ego[2]]
+ return point_ego
+
+def world_to_lidar(point_world, w2l):
+ point_world = np.array([point_world[0], point_world[1], point_world[2], 1])
+ point_lidar = np.dot(w2l, point_world)
+ return point_lidar
+
+def vector_angle(v1, v2):
+ dot_product = np.dot(v1, v2)
+ magnitude_v1 = np.linalg.norm(v1)
+ magnitude_v2 = np.linalg.norm(v2)
+    cos_theta = np.clip(dot_product / (magnitude_v1 * magnitude_v2), -1.0, 1.0)  # guard against rounding outside [-1, 1]
+ angle_radians = np.arccos(cos_theta)
+ angle_degrees = np.degrees(angle_radians)
+ return angle_degrees
+
+def get_weather_id(weather_conditions):
+ from xml.etree import ElementTree as ET
+ tree = ET.parse('./weather.xml')
+ root = tree.getroot()
+ def conditions_match(weather, conditions):
+ for (key, value) in weather:
+ if key == 'route_percentage' : continue
+ if str(conditions[key]) != value:
+ return False
+ return True
+ for case in root.findall('case'):
+ weather = case[0].items()
+ if conditions_match(weather, weather_conditions):
+ return case.items()[0][1]
+ return None
+
+
+def static_weather(path):
+    """Tally how often each weather id occurs under ``path`` and print the result."""
+ import gzip
+ import json
+ static_dict = {}
+ for dir in os.listdir(path):
+ for d1 in os.listdir(os.path.join(path, dir)):
+ if os.path.exists(os.path.join(path, dir, d1, 'anno/00000.json.gz')):
+ with gzip.open(os.path.join(path, dir, d1, 'anno/00000.json.gz'), 'rt', encoding='utf-8') as gz_file:
+ anno = json.load(gz_file)
+ weather = anno['weather']
+ weather_id = get_weather_id(weather)
+ static_dict[weather_id] = static_dict.get(weather_id, 0) + 1
+ print(static_dict)
+ return
+
+if __name__ == '__main__':
+
+ path = ''
+ static_weather(path)
\ No newline at end of file
diff --git a/mmcv/fileio/__init__.py b/mmcv/fileio/__init__.py
new file mode 100644
index 0000000..b08824c
--- /dev/null
+++ b/mmcv/fileio/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# from .file_client import BaseStorageBackend, FileClient
+# from .io import dump, load, imread #register_handler
+# from .handlers import PickleHandler, JsonHandler
+# from .parse import *
\ No newline at end of file
diff --git a/mmcv/fileio/file_client.py b/mmcv/fileio/file_client.py
new file mode 100644
index 0000000..705eb65
--- /dev/null
+++ b/mmcv/fileio/file_client.py
@@ -0,0 +1,1146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+import os
+import os.path as osp
+import re
+import tempfile
+import warnings
+from abc import ABCMeta, abstractmethod
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Iterable, Iterator, Optional, Tuple, Union
+from urllib.request import urlopen
+from mmcv.utils.misc import has_method
+from mmcv.utils.path import is_filepath, mkdir_or_exist
+
+
+class BaseStorageBackend(metaclass=ABCMeta):
+ """Abstract class of storage backends.
+
+ All backends need to implement two apis: ``get()`` and ``get_text()``.
+ ``get()`` reads the file as a byte stream and ``get_text()`` reads the file
+ as texts.
+ """
+
+ # a flag to indicate whether the backend can create a symlink for a file
+ _allow_symlink = False
+
+ @property
+ def name(self):
+ return self.__class__.__name__
+
+ @property
+ def allow_symlink(self):
+ return self._allow_symlink
+
+ @abstractmethod
+ def get(self, filepath):
+ pass
+
+ @abstractmethod
+ def get_text(self, filepath):
+ pass
+
+
+class CephBackend(BaseStorageBackend):
+ """Ceph storage backend (for internal use).
+
+ Args:
+ path_mapping (dict|None): path mapping dict from local path to Petrel
+ path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath``
+ will be replaced by ``dst``. Default: None.
+
+ .. warning::
+ :class:`mmcv.fileio.file_client.CephBackend` will be deprecated,
+ please use :class:`mmcv.fileio.file_client.PetrelBackend` instead.
+ """
+
+ def __init__(self, path_mapping=None):
+ try:
+ import ceph
+ except ImportError:
+ raise ImportError('Please install ceph to enable CephBackend.')
+
+ warnings.warn(
+ 'CephBackend will be deprecated, please use PetrelBackend instead')
+ self._client = ceph.S3Client()
+ assert isinstance(path_mapping, dict) or path_mapping is None
+ self.path_mapping = path_mapping
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ if self.path_mapping is not None:
+ for k, v in self.path_mapping.items():
+ filepath = filepath.replace(k, v)
+ value = self._client.Get(filepath)
+ value_buf = memoryview(value)
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class PetrelBackend(BaseStorageBackend):
+ """Petrel storage backend (for internal use).
+
+ PetrelBackend supports reading and writing data to multiple clusters.
+ If the file path contains the cluster name, PetrelBackend will read data
+ from specified cluster or write data to it. Otherwise, PetrelBackend will
+ access the default cluster.
+
+ Args:
+ path_mapping (dict, optional): Path mapping dict from local path to
+ Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in
+ ``filepath`` will be replaced by ``dst``. Default: None.
+ enable_mc (bool, optional): Whether to enable memcached support.
+ Default: True.
+
+ Examples:
+ >>> filepath1 = 's3://path/of/file'
+ >>> filepath2 = 'cluster-name:s3://path/of/file'
+ >>> client = PetrelBackend()
+ >>> client.get(filepath1) # get data from default cluster
+ >>> client.get(filepath2) # get data from 'cluster-name' cluster
+ """
+
+ def __init__(self,
+ path_mapping: Optional[dict] = None,
+ enable_mc: bool = True):
+ try:
+ from petrel_client import client
+ except ImportError:
+ raise ImportError('Please install petrel_client to enable '
+ 'PetrelBackend.')
+
+ self._client = client.Client(enable_mc=enable_mc)
+ assert isinstance(path_mapping, dict) or path_mapping is None
+ self.path_mapping = path_mapping
+
+ def _map_path(self, filepath: Union[str, Path]) -> str:
+ """Map ``filepath`` to a string path whose prefix will be replaced by
+ :attr:`self.path_mapping`.
+
+ Args:
+ filepath (str): Path to be mapped.
+ """
+ filepath = str(filepath)
+ if self.path_mapping is not None:
+ for k, v in self.path_mapping.items():
+ filepath = filepath.replace(k, v)
+ return filepath
+
+ def _format_path(self, filepath: str) -> str:
+ """Convert a ``filepath`` to standard format of petrel oss.
+
+ If the ``filepath`` is concatenated by ``os.path.join``, in a Windows
+ environment, the ``filepath`` will be the format of
+ 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the
+ above ``filepath`` will be converted to 's3://bucket_name/image.jpg'.
+
+ Args:
+ filepath (str): Path to be formatted.
+ """
+ return re.sub(r'\\+', '/', filepath)
+
+ def get(self, filepath: Union[str, Path]) -> memoryview:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ memoryview: A memory view of expected bytes object to avoid
+ copying. The memoryview object can be converted to bytes by
+ ``value_buf.tobytes()``.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ value = self._client.Get(filepath)
+ value_buf = memoryview(value)
+ return value_buf
+
+ def get_text(self,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ return str(self.get(filepath), encoding=encoding)
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Save data to a given ``filepath``.
+
+ Args:
+ obj (bytes): Data to be saved.
+ filepath (str or Path): Path to write data.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ self._client.put(filepath, obj)
+
+ def put_text(self,
+ obj: str,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> None:
+ """Save data to a given ``filepath``.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ encoding (str): The encoding format used to encode the ``obj``.
+ Default: 'utf-8'.
+ """
+ self.put(bytes(obj, encoding=encoding), filepath)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str or Path): Path to be removed.
+ """
+ if not has_method(self._client, 'delete'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `delete` method, please use a higher version or dev'
+ ' branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ self._client.delete(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ if not (has_method(self._client, 'contains')
+ and has_method(self._client, 'isdir')):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+                 'the `contains` and `isdir` methods, please use a higher '
+ 'version or dev branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.contains(filepath) or self._client.isdir(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ if not has_method(self._client, 'isdir'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `isdir` method, please use a higher version or dev'
+ ' branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ if not has_method(self._client, 'contains'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `contains` method, please use a higher version or '
+ 'dev branch instead.'))
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.contains(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result after concatenation.
+ """
+ filepath = self._format_path(self._map_path(filepath))
+ if filepath.endswith('/'):
+ filepath = filepath[:-1]
+ formatted_paths = [filepath]
+ for path in filepaths:
+ formatted_paths.append(self._format_path(self._map_path(path)))
+ return '/'.join(formatted_paths)
+
+ @contextmanager
+ def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]:
+ """Download a file from ``filepath`` and return a temporary path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
+        can be called with the ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+ Args:
+ filepath (str | Path): Download a file from ``filepath``.
+
+ Examples:
+ >>> client = PetrelBackend()
+            >>> # After exiting from the ``with`` clause,
+ >>> # the path will be removed
+ >>> with client.get_local_path('s3://path/of/your/file') as path:
+ ... # do something here
+
+ Yields:
+ Iterable[str]: Only yield one temporary path.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ assert self.isfile(filepath)
+ try:
+ f = tempfile.NamedTemporaryFile(delete=False)
+ f.write(self.get(filepath))
+ f.close()
+ yield f.name
+ finally:
+ os.remove(f.name)
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ Petrel has no concept of directories but it simulates the directory
+ hierarchy in the filesystem through public prefixes. In addition,
+ if the returned path ends with '/', it means the path is a public
+ prefix which is a logical directory.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+            In addition, the returned directory path will not contain the trailing
+            '/', which is consistent with other backends.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ if not has_method(self._client, 'list'):
+ raise NotImplementedError(
+ ('Current version of Petrel Python SDK has not supported '
+ 'the `list` method, please use a higher version or dev'
+ ' branch instead.'))
+
+ dir_path = self._map_path(dir_path)
+ dir_path = self._format_path(dir_path)
+ if list_dir and suffix is not None:
+ raise TypeError(
+ '`list_dir` should be False when `suffix` is not None')
+
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
+ raise TypeError('`suffix` must be a string or tuple of strings')
+
+ # Petrel's simulated directory hierarchy assumes that directory paths
+ # should end with `/`
+ if not dir_path.endswith('/'):
+ dir_path += '/'
+
+ root = dir_path
+
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive):
+ for path in self._client.list(dir_path):
+ # the `self.isdir` is not used here to determine whether path
+ # is a directory, because `self.isdir` relies on
+ # `self._client.list`
+ if path.endswith('/'): # a directory path
+ next_dir_path = self.join_path(dir_path, path)
+ if list_dir:
+ # get the relative path and exclude the last
+ # character '/'
+ rel_dir = next_dir_path[len(root):-1]
+ yield rel_dir
+ if recursive:
+ yield from _list_dir_or_file(next_dir_path, list_dir,
+ list_file, suffix,
+ recursive)
+ else: # a file path
+ absolute_path = self.join_path(dir_path, path)
+ rel_path = absolute_path[len(root):]
+ if (suffix is None
+ or rel_path.endswith(suffix)) and list_file:
+ yield rel_path
+
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive)
+
+
+class MemcachedBackend(BaseStorageBackend):
+ """Memcached storage backend.
+
+ Attributes:
+ server_list_cfg (str): Config file for memcached server list.
+ client_cfg (str): Config file for memcached client.
+ sys_path (str | None): Additional path to be appended to `sys.path`.
+ Default: None.
+ """
+
+ def __init__(self, server_list_cfg, client_cfg, sys_path=None):
+ if sys_path is not None:
+ import sys
+ sys.path.append(sys_path)
+ try:
+ import mc
+ except ImportError:
+ raise ImportError(
+ 'Please install memcached to enable MemcachedBackend.')
+
+ self.server_list_cfg = server_list_cfg
+ self.client_cfg = client_cfg
+ self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg,
+ self.client_cfg)
+        # mc.pyvector serves as a pointer to a memory cache
+ self._mc_buffer = mc.pyvector()
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ import mc
+ self._client.Get(filepath, self._mc_buffer)
+ value_buf = mc.ConvertBuffer(self._mc_buffer)
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class LmdbBackend(BaseStorageBackend):
+ """Lmdb storage backend.
+
+ Args:
+ db_path (str): Lmdb database path.
+ readonly (bool, optional): Lmdb environment parameter. If True,
+ disallow any write operations. Default: True.
+ lock (bool, optional): Lmdb environment parameter. If False, when
+ concurrent access occurs, do not lock the database. Default: False.
+ readahead (bool, optional): Lmdb environment parameter. If False,
+ disable the OS filesystem readahead mechanism, which may improve
+ random read performance when a database is larger than RAM.
+ Default: False.
+
+ Attributes:
+ db_path (str): Lmdb database path.
+ """
+
+ def __init__(self,
+ db_path,
+ readonly=True,
+ lock=False,
+ readahead=False,
+ **kwargs):
+ try:
+ import lmdb
+ except ImportError:
+ raise ImportError('Please install lmdb to enable LmdbBackend.')
+
+ self.db_path = str(db_path)
+ self._client = lmdb.open(
+ self.db_path,
+ readonly=readonly,
+ lock=lock,
+ readahead=readahead,
+ **kwargs)
+
+ def get(self, filepath):
+ """Get values according to the filepath.
+
+ Args:
+ filepath (str | obj:`Path`): Here, filepath is the lmdb key.
+ """
+ filepath = str(filepath)
+ with self._client.begin(write=False) as txn:
+ value_buf = txn.get(filepath.encode('ascii'))
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class HardDiskBackend(BaseStorageBackend):
+ """Raw hard disks storage backend."""
+
+ _allow_symlink = True
+
+ def get(self, filepath: Union[str, Path]) -> bytes:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ bytes: Expected bytes object.
+ """
+ with open(filepath, 'rb') as f:
+ value_buf = f.read()
+ return value_buf
+
+ def get_text(self,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ with open(filepath, 'r', encoding=encoding) as f:
+ value_buf = f.read()
+ return value_buf
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'wb' mode.
+
+ Note:
+ ``put`` will create a directory if the directory of ``filepath``
+ does not exist.
+
+ Args:
+ obj (bytes): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ mkdir_or_exist(osp.dirname(filepath))
+ with open(filepath, 'wb') as f:
+ f.write(obj)
+
+ def put_text(self,
+ obj: str,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> None:
+ """Write data to a given ``filepath`` with 'w' mode.
+
+ Note:
+ ``put_text`` will create a directory if the directory of
+ ``filepath`` does not exist.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+ """
+ mkdir_or_exist(osp.dirname(filepath))
+ with open(filepath, 'w', encoding=encoding) as f:
+ f.write(obj)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str or Path): Path to be removed.
+ """
+ os.remove(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ return osp.exists(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ return osp.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ return osp.isfile(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Join one or more filepath components intelligently. The return value
+ is the concatenation of filepath and any members of *filepaths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result of concatenation.
+ """
+ return osp.join(filepath, *filepaths)
+
+ @contextmanager
+ def get_local_path(
+ self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]:
+ """Only for unified API and do nothing."""
+ yield filepath
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ if list_dir and suffix is not None:
+ raise TypeError('`suffix` should be None when `list_dir` is True')
+
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
+ raise TypeError('`suffix` must be a string or tuple of strings')
+
+ root = dir_path
+
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive):
+ for entry in os.scandir(dir_path):
+ if not entry.name.startswith('.') and entry.is_file():
+ rel_path = osp.relpath(entry.path, root)
+ if (suffix is None
+ or rel_path.endswith(suffix)) and list_file:
+ yield rel_path
+ elif osp.isdir(entry.path):
+ if list_dir:
+ rel_dir = osp.relpath(entry.path, root)
+ yield rel_dir
+ if recursive:
+ yield from _list_dir_or_file(entry.path, list_dir,
+ list_file, suffix,
+ recursive)
+
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive)
+
+
+class HTTPBackend(BaseStorageBackend):
+ """HTTP and HTTPS storage bachend."""
+
+ def get(self, filepath):
+ value_buf = urlopen(filepath).read()
+ return value_buf
+
+ def get_text(self, filepath, encoding='utf-8'):
+ value_buf = urlopen(filepath).read()
+ return value_buf.decode(encoding)
+
+ @contextmanager
+ def get_local_path(self, filepath: str) -> Iterable[str]:
+ """Download a file from ``filepath``.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
+        can be called with the ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+ Args:
+ filepath (str): Download a file from ``filepath``.
+
+ Examples:
+ >>> client = HTTPBackend()
+            >>> # After exiting from the ``with`` clause,
+ >>> # the path will be removed
+ >>> with client.get_local_path('http://path/of/your/file') as path:
+ ... # do something here
+ """
+ try:
+ f = tempfile.NamedTemporaryFile(delete=False)
+ f.write(self.get(filepath))
+ f.close()
+ yield f.name
+ finally:
+ os.remove(f.name)
+
+
+class FileClient:
+ """A general file client to access files in different backends.
+
+    The client loads a file or text in a specified backend from its path
+    and returns it as a binary or text file. There are two ways to choose a
+    backend: the name of the backend or the prefix of the path. Although both
+    can be used to choose a storage backend, ``backend`` has higher priority;
+    that is, if both are set, the storage backend is chosen by the ``backend``
+    argument. If both are ``None``, the disk backend is chosen. Other backend
+    accessors can also be registered with a given name, prefixes, and backend
+    class. In addition, the singleton pattern is used to avoid repeated object
+    creation: if the arguments are the same, the same object is returned.
+
+ Args:
+ backend (str, optional): The storage backend type. Options are "disk",
+ "ceph", "memcached", "lmdb", "http" and "petrel". Default: None.
+ prefix (str, optional): The prefix of the registered storage backend.
+ Options are "s3", "http", "https". Default: None.
+
+ Examples:
+ >>> # only set backend
+ >>> file_client = FileClient(backend='petrel')
+ >>> # only set prefix
+ >>> file_client = FileClient(prefix='s3')
+ >>> # set both backend and prefix but use backend to choose client
+ >>> file_client = FileClient(backend='petrel', prefix='s3')
+ >>> # if the arguments are the same, the same object is returned
+ >>> file_client1 = FileClient(backend='petrel')
+ >>> file_client1 is file_client
+ True
+
+ Attributes:
+ client (:obj:`BaseStorageBackend`): The backend object.
+ """
+
+ _backends = {
+ 'disk': HardDiskBackend,
+ 'ceph': CephBackend,
+ 'memcached': MemcachedBackend,
+ 'lmdb': LmdbBackend,
+ 'petrel': PetrelBackend,
+ 'http': HTTPBackend,
+ }
+ # This collection is used to record the overridden backends, and when a
+ # backend appears in the collection, the singleton pattern is disabled for
+ # that backend, because if the singleton pattern is used, then the object
+ # returned will be the backend before overwriting
+ _overridden_backends = set()
+ _prefix_to_backends = {
+ 's3': PetrelBackend,
+ 'http': HTTPBackend,
+ 'https': HTTPBackend,
+ }
+ _overridden_prefixes = set()
+
+ _instances = {}
+
+ def __new__(cls, backend=None, prefix=None, **kwargs):
+ if backend is None and prefix is None:
+ backend = 'disk'
+ if backend is not None and backend not in cls._backends:
+ raise ValueError(
+ f'Backend {backend} is not supported. Currently supported ones'
+ f' are {list(cls._backends.keys())}')
+ if prefix is not None and prefix not in cls._prefix_to_backends:
+ raise ValueError(
+ f'prefix {prefix} is not supported. Currently supported ones '
+ f'are {list(cls._prefix_to_backends.keys())}')
+
+ # concatenate the arguments to a unique key for determining whether
+ # objects with the same arguments were created
+ arg_key = f'{backend}:{prefix}'
+ for key, value in kwargs.items():
+ arg_key += f':{key}:{value}'
+
+ # if a backend was overridden, it will create a new object
+ if (arg_key in cls._instances
+ and backend not in cls._overridden_backends
+ and prefix not in cls._overridden_prefixes):
+ _instance = cls._instances[arg_key]
+ else:
+ # create a new object and put it to _instance
+ _instance = super().__new__(cls)
+ if backend is not None:
+ _instance.client = cls._backends[backend](**kwargs)
+ else:
+ _instance.client = cls._prefix_to_backends[prefix](**kwargs)
+
+ cls._instances[arg_key] = _instance
+
+ return _instance
+
+ @property
+ def name(self):
+ return self.client.name
+
+ @property
+ def allow_symlink(self):
+ return self.client.allow_symlink
+
+ @staticmethod
+ def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]:
+ """Parse the prefix of a uri.
+
+ Args:
+ uri (str | Path): Uri to be parsed that contains the file prefix.
+
+ Examples:
+ >>> FileClient.parse_uri_prefix('s3://path/of/your/file')
+ 's3'
+
+ Returns:
+ str | None: Return the prefix of uri if the uri contains '://'
+ else ``None``.
+ """
+ assert is_filepath(uri)
+ uri = str(uri)
+ if '://' not in uri:
+ return None
+ else:
+ prefix, _ = uri.split('://')
+            # In the case of PetrelBackend, the prefix may contain the cluster
+ # name like clusterName:s3
+ if ':' in prefix:
+ _, prefix = prefix.split(':')
+ return prefix
+
+ @classmethod
+ def infer_client(cls,
+ file_client_args: Optional[dict] = None,
+ uri: Optional[Union[str, Path]] = None) -> 'FileClient':
+ """Infer a suitable file client based on the URI and arguments.
+
+ Args:
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. Default: None.
+ uri (str | Path, optional): Uri to be parsed that contains the file
+ prefix. Default: None.
+
+ Examples:
+ >>> uri = 's3://path/of/your/file'
+ >>> file_client = FileClient.infer_client(uri=uri)
+ >>> file_client_args = {'backend': 'petrel'}
+ >>> file_client = FileClient.infer_client(file_client_args)
+
+ Returns:
+ FileClient: Instantiated FileClient object.
+ """
+ assert file_client_args is not None or uri is not None
+ if file_client_args is None:
+ file_prefix = cls.parse_uri_prefix(uri) # type: ignore
+ return cls(prefix=file_prefix)
+ else:
+ return cls(**file_client_args)
+
+ @classmethod
+ def _register_backend(cls, name, backend, force=False, prefixes=None):
+ if not isinstance(name, str):
+ raise TypeError('the backend name should be a string, '
+ f'but got {type(name)}')
+ if not inspect.isclass(backend):
+ raise TypeError(
+ f'backend should be a class but got {type(backend)}')
+ if not issubclass(backend, BaseStorageBackend):
+ raise TypeError(
+ f'backend {backend} is not a subclass of BaseStorageBackend')
+ if not force and name in cls._backends:
+ raise KeyError(
+ f'{name} is already registered as a storage backend, '
+ 'add "force=True" if you want to override it')
+
+ if name in cls._backends and force:
+ cls._overridden_backends.add(name)
+ cls._backends[name] = backend
+
+ if prefixes is not None:
+ if isinstance(prefixes, str):
+ prefixes = [prefixes]
+ else:
+ assert isinstance(prefixes, (list, tuple))
+ for prefix in prefixes:
+ if prefix not in cls._prefix_to_backends:
+ cls._prefix_to_backends[prefix] = backend
+ elif (prefix in cls._prefix_to_backends) and force:
+ cls._overridden_prefixes.add(prefix)
+ cls._prefix_to_backends[prefix] = backend
+ else:
+ raise KeyError(
+ f'{prefix} is already registered as a storage backend,'
+ ' add "force=True" if you want to override it')
+
+ @classmethod
+ def register_backend(cls, name, backend=None, force=False, prefixes=None):
+ """Register a backend to FileClient.
+
+ This method can be used as a normal class method or a decorator.
+
+ .. code-block:: python
+
+ class NewBackend(BaseStorageBackend):
+
+ def get(self, filepath):
+ return filepath
+
+ def get_text(self, filepath):
+ return filepath
+
+ FileClient.register_backend('new', NewBackend)
+
+ or
+
+ .. code-block:: python
+
+ @FileClient.register_backend('new')
+ class NewBackend(BaseStorageBackend):
+
+ def get(self, filepath):
+ return filepath
+
+ def get_text(self, filepath):
+ return filepath
+
+ Args:
+ name (str): The name of the registered backend.
+ backend (class, optional): The backend class to be registered,
+ which must be a subclass of :class:`BaseStorageBackend`.
+ When this method is used as a decorator, backend is None.
+ Defaults to None.
+ force (bool, optional): Whether to override the backend if the name
+ has already been registered. Defaults to False.
+ prefixes (str or list[str] or tuple[str], optional): The prefixes
+ of the registered storage backend. Default: None.
+ `New in version 1.3.15.`
+ """
+ if backend is not None:
+ cls._register_backend(
+ name, backend, force=force, prefixes=prefixes)
+ return
+
+ def _register(backend_cls):
+ cls._register_backend(
+ name, backend_cls, force=force, prefixes=prefixes)
+ return backend_cls
+
+ return _register
+
+ def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Note:
+ There are two types of return values for ``get``, one is ``bytes``
+ and the other is ``memoryview``. The advantage of using memoryview
+ is that you can avoid copying, and if you want to convert it to
+ ``bytes``, you can use ``.tobytes()``.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ bytes | memoryview: Expected bytes object or a memory view of the
+ bytes object.
+ """
+ return self.client.get(filepath)
+
+ def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ return self.client.get_text(filepath, encoding)
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'wb' mode.
+
+ Note:
+ ``put`` should create a directory if the directory of ``filepath``
+ does not exist.
+
+ Args:
+ obj (bytes): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ self.client.put(obj, filepath)
+
+ def put_text(self, obj: str, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'w' mode.
+
+ Note:
+ ``put_text`` should create a directory if the directory of
+ ``filepath`` does not exist.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ self.client.put_text(obj, filepath)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str, Path): Path to be removed.
+ """
+ self.client.remove(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ return self.client.exists(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ return self.client.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ return self.client.isfile(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Join one or more filepath components intelligently. The return value
+ is the concatenation of filepath and any members of *filepaths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result of concatenation.
+ """
+ return self.client.join_path(filepath, *filepaths)
+
+ @contextmanager
+ def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]:
+ """Download data from ``filepath`` and write the data to local path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
+        can be called with the ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+ Note:
+ If the ``filepath`` is a local path, just return itself.
+
+ .. warning::
+ ``get_local_path`` is an experimental interface that may change in
+ the future.
+
+ Args:
+ filepath (str or Path): Path to be read data.
+
+ Examples:
+ >>> file_client = FileClient(prefix='s3')
+ >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path:
+ ... # do something here
+
+ Yields:
+ Iterable[str]: Only yield one path.
+ """
+ with self.client.get_local_path(str(filepath)) as local_path:
+ yield local_path
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ yield from self.client.list_dir_or_file(dir_path, list_dir, list_file,
+ suffix, recursive)
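+
+# Example (illustrative only; ``DummyBackend`` is hypothetical and nothing here is
+# executed): a custom backend can be bound to a URI prefix and is then picked up
+# automatically by ``FileClient``.
+#
+#   @FileClient.register_backend('dummy', prefixes='dummy')
+#   class DummyBackend(BaseStorageBackend):
+#
+#       def get(self, filepath):
+#           return b'content of ' + str(filepath).encode()
+#
+#       def get_text(self, filepath, encoding='utf-8'):
+#           return 'content of ' + str(filepath)
+#
+#   client = FileClient(prefix='dummy')
+#   client.get_text('dummy://path/of/file')  # -> 'content of dummy://path/of/file'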
diff --git a/mmcv/fileio/handlers/__init__.py b/mmcv/fileio/handlers/__init__.py
new file mode 100644
index 0000000..4756674
--- /dev/null
+++ b/mmcv/fileio/handlers/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import BaseFileHandler
+from .json_handler import JsonHandler
+from .pickle_handler import PickleHandler
\ No newline at end of file
diff --git a/mmcv/fileio/handlers/base.py b/mmcv/fileio/handlers/base.py
new file mode 100644
index 0000000..288878b
--- /dev/null
+++ b/mmcv/fileio/handlers/base.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseFileHandler(metaclass=ABCMeta):
+    # `str_like` is a flag to indicate whether the type of file object is
+    # str-like or bytes-like. Pickle only processes bytes-like objects while
+    # json only processes str-like objects. If it is a str-like object,
+    # `StringIO` will be used to process the buffer.
+ str_like = True
+
+ @abstractmethod
+ def load_from_fileobj(self, file, **kwargs):
+ pass
+
+ @abstractmethod
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ pass
+
+ @abstractmethod
+ def dump_to_str(self, obj, **kwargs):
+ pass
+
+ def load_from_path(self, filepath, mode='r', **kwargs):
+ with open(filepath, mode) as f:
+ return self.load_from_fileobj(f, **kwargs)
+
+ def dump_to_path(self, obj, filepath, mode='w', **kwargs):
+ with open(filepath, mode) as f:
+ self.dump_to_fileobj(obj, f, **kwargs)
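+
+# Sketch of a minimal concrete handler (illustrative; such a plain-text handler is
+# not shipped here) showing what the three abstract methods must provide:
+#
+#   class TxtHandler(BaseFileHandler):
+#
+#       def load_from_fileobj(self, file, **kwargs):
+#           return file.read()
+#
+#       def dump_to_fileobj(self, obj, file, **kwargs):
+#           file.write(str(obj))
+#
+#       def dump_to_str(self, obj, **kwargs):
+#           return str(obj)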
diff --git a/mmcv/fileio/handlers/json_handler.py b/mmcv/fileio/handlers/json_handler.py
new file mode 100644
index 0000000..18d4f15
--- /dev/null
+++ b/mmcv/fileio/handlers/json_handler.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+import numpy as np
+
+from .base import BaseFileHandler
+
+
+def set_default(obj):
+ """Set default json values for non-serializable values.
+
+ It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list.
+ It also converts ``np.generic`` (including ``np.int32``, ``np.float32``,
+ etc.) into plain numbers of plain python built-in types.
+ """
+ if isinstance(obj, (set, range)):
+ return list(obj)
+ elif isinstance(obj, np.ndarray):
+ return obj.tolist()
+ elif isinstance(obj, np.generic):
+ return obj.item()
+ raise TypeError(f'{type(obj)} is unsupported for json dump')
+
+
+class JsonHandler(BaseFileHandler):
+
+ def load_from_fileobj(self, file):
+ return json.load(file)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('default', set_default)
+ json.dump(obj, file, **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('default', set_default)
+ return json.dumps(obj, **kwargs)
diff --git a/mmcv/fileio/handlers/pickle_handler.py b/mmcv/fileio/handlers/pickle_handler.py
new file mode 100644
index 0000000..b37c79b
--- /dev/null
+++ b/mmcv/fileio/handlers/pickle_handler.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pickle
+
+from .base import BaseFileHandler
+
+
+class PickleHandler(BaseFileHandler):
+
+ str_like = False
+
+ def load_from_fileobj(self, file, **kwargs):
+ return pickle.load(file, **kwargs)
+
+ def load_from_path(self, filepath, **kwargs):
+ return super(PickleHandler, self).load_from_path(
+ filepath, mode='rb', **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ return pickle.dumps(obj, **kwargs)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ pickle.dump(obj, file, **kwargs)
+
+ def dump_to_path(self, obj, filepath, **kwargs):
+ super(PickleHandler, self).dump_to_path(
+ obj, filepath, mode='wb', **kwargs)
diff --git a/mmcv/fileio/io.py b/mmcv/fileio/io.py
new file mode 100644
index 0000000..6155a5d
--- /dev/null
+++ b/mmcv/fileio/io.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from io import BytesIO, StringIO
+from pathlib import Path
+
+from ..utils.misc import is_list_of, is_str
+from .handlers import BaseFileHandler, JsonHandler, PickleHandler
+
+file_handlers = {
+ 'json': JsonHandler(),
+ # 'yaml': YamlHandler(),
+ # 'yml': YamlHandler(),
+ 'pickle': PickleHandler(),
+ 'pkl': PickleHandler()
+}
+
+
+def load(file, file_format=None, file_client_args=None, **kwargs):
+ """Load data from json/yaml/pickle files.
+
+ This method provides a unified api for loading data from serialized files.
+
+ Note:
+ In v1.3.16 and later, ``load`` supports loading data from serialized
+        files that can be stored in different backends.
+
+ Args:
+ file (str or :obj:`Path` or file-like object): Filename or a file-like
+ object.
+ file_format (str, optional): If not specified, the file format will be
+ inferred from the file extension, otherwise use the specified one.
+ Currently supported formats include "json", "yaml/yml" and
+ "pickle/pkl".
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> load('/path/of/your/file') # file is stored on disk
+ >>> load('https://path/of/your/file') # file is stored on the Internet
+ >>> load('s3://path/of/your/file') # file is stored in petrel
+
+ Returns:
+ The content from the file.
+ """
+ #TODO(JIAZI)
+ from .file_client import FileClient
+ if isinstance(file, Path):
+ file = str(file)
+ if file_format is None and is_str(file):
+ file_format = file.split('.')[-1]
+ if file_format not in file_handlers:
+ raise TypeError(f'Unsupported format: {file_format}')
+
+ handler = file_handlers[file_format]
+ if is_str(file):
+ file_client = FileClient.infer_client(file_client_args, file)
+ if handler.str_like:
+ with StringIO(file_client.get_text(file)) as f:
+ obj = handler.load_from_fileobj(f, **kwargs)
+ else:
+ with BytesIO(file_client.get(file)) as f:
+ obj = handler.load_from_fileobj(f, **kwargs)
+ elif hasattr(file, 'read'):
+ obj = handler.load_from_fileobj(file, **kwargs)
+ else:
+ raise TypeError('"file" must be a filepath str or a file-object')
+ return obj
+
+
+def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs):
+ """Dump data to json/yaml/pickle strings or files.
+
+ This method provides a unified api for dumping data as strings or to files,
+ and also supports custom arguments for each file format.
+
+ Note:
+ In v1.3.16 and later, ``dump`` supports dumping data as strings or to
+ files that can be saved to different backends.
+
+ Args:
+ obj (any): The python object to be dumped.
+ file (str or :obj:`Path` or file-like object, optional): If not
+ specified, then the object is dumped to a str, otherwise to a file
+ specified by the filename or file-like object.
+ file_format (str, optional): Same as :func:`load`.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> dump('hello world', '/path/of/your/file') # disk
+ >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel
+
+ Returns:
+ bool: True for success, False otherwise.
+ """
+ #TODO(JIAZI)
+ from .file_client import FileClient
+ if isinstance(file, Path):
+ file = str(file)
+ if file_format is None:
+ if is_str(file):
+ file_format = file.split('.')[-1]
+ elif file is None:
+ raise ValueError(
+ 'file_format must be specified since file is None')
+ if file_format not in file_handlers:
+ raise TypeError(f'Unsupported format: {file_format}')
+
+ handler = file_handlers[file_format]
+ if file is None:
+ return handler.dump_to_str(obj, **kwargs)
+ elif is_str(file):
+ file_client = FileClient.infer_client(file_client_args, file)
+ if handler.str_like:
+ with StringIO() as f:
+ handler.dump_to_fileobj(obj, f, **kwargs)
+ file_client.put_text(f.getvalue(), file)
+ else:
+ with BytesIO() as f:
+ handler.dump_to_fileobj(obj, f, **kwargs)
+ file_client.put(f.getvalue(), file)
+ elif hasattr(file, 'write'):
+ handler.dump_to_fileobj(obj, file, **kwargs)
+ else:
+ raise TypeError('"file" must be a filename str or a file-object')
+
+
+def _register_handler(handler, file_formats):
+ """Register a handler for some file extensions.
+
+ Args:
+ handler (:obj:`BaseFileHandler`): Handler to be registered.
+ file_formats (str or list[str]): File formats to be handled by this
+ handler.
+ """
+ if not isinstance(handler, BaseFileHandler):
+ raise TypeError(
+ f'handler must be a child of BaseFileHandler, not {type(handler)}')
+ if isinstance(file_formats, str):
+ file_formats = [file_formats]
+ if not is_list_of(file_formats, str):
+ raise TypeError('file_formats must be a str or a list of str')
+ for ext in file_formats:
+ file_handlers[ext] = handler
+
+
+def register_handler(file_formats, **kwargs):
+
+ def wrap(cls):
+ _register_handler(cls(**kwargs), file_formats)
+ return cls
+
+ return wrap
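+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative): dump/load via in-memory strings and
+    # register a toy handler for a hypothetical 'txt' format.
+    from io import StringIO
+
+    text = dump({'a': 1}, file_format='json')          # JSON string
+    print(load(StringIO(text), file_format='json'))    # {'a': 1}
+
+    @register_handler('txt')
+    class TxtHandler(BaseFileHandler):
+        # A deliberately tiny handler: plain text in, plain text out.
+        def load_from_fileobj(self, file):
+            return file.read()
+
+        def dump_to_fileobj(self, obj, file):
+            file.write(str(obj))
+
+        def dump_to_str(self, obj):
+            return str(obj)
+
+    print(dump('hello world', file_format='txt'))      # hello world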
diff --git a/mmcv/fileio/parse.py b/mmcv/fileio/parse.py
new file mode 100644
index 0000000..f60f0d6
--- /dev/null
+++ b/mmcv/fileio/parse.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from io import StringIO
+
+from .file_client import FileClient
+
+
+def list_from_file(filename,
+ prefix='',
+ offset=0,
+ max_num=0,
+ encoding='utf-8',
+ file_client_args=None):
+ """Load a text file and parse the content as a list of strings.
+
+ Note:
+ In v1.3.16 and later, ``list_from_file`` supports loading a text file
+ which can be stored in different backends and parsing the content as
+ a list of strings.
+
+ Args:
+ filename (str): Filename.
+ prefix (str): The prefix to be inserted at the beginning of each item.
+ offset (int): The number of lines to skip at the beginning of the file.
+ max_num (int): The maximum number of lines to be read; zero or negative
+ values mean no limit.
+ encoding (str): Encoding used to open the file. Default utf-8.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> list_from_file('/path/of/your/file') # disk
+ ['hello', 'world']
+ >>> list_from_file('s3://path/of/your/file') # ceph or petrel
+ ['hello', 'world']
+
+ Returns:
+ list[str]: A list of strings.
+ """
+ cnt = 0
+ item_list = []
+ file_client = FileClient.infer_client(file_client_args, filename)
+ with StringIO(file_client.get_text(filename, encoding)) as f:
+ for _ in range(offset):
+ f.readline()
+ for line in f:
+ if 0 < max_num <= cnt:
+ break
+ item_list.append(prefix + line.rstrip('\n\r'))
+ cnt += 1
+ return item_list
+
+
+def dict_from_file(filename,
+ key_type=str,
+ encoding='utf-8',
+ file_client_args=None):
+ """Load a text file and parse the content as a dict.
+
+ Each line of the text file will be two or more columns split by
+ whitespaces or tabs. The first column will be parsed as dict keys, and
+ the following columns will be parsed as dict values.
+
+ Note:
+ In v1.3.16 and later, ``dict_from_file`` supports loading a text file
+ which can be stored in different backends and parsing the content as
+ a dict.
+
+ Args:
+ filename(str): Filename.
+ key_type(type): Type of the dict keys. str is used by default and
+ type conversion will be performed if specified.
+ encoding (str): Encoding used to open the file. Default utf-8.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> dict_from_file('/path/of/your/file') # disk
+ {'key1': 'value1', 'key2': 'value2'}
+ >>> dict_from_file('s3://path/of/your/file') # ceph or petrel
+ {'key1': 'value1', 'key2': 'value2'}
+
+ Returns:
+ dict: The parsed contents.
+ """
+ mapping = {}
+ file_client = FileClient.infer_client(file_client_args, filename)
+ with StringIO(file_client.get_text(filename, encoding)) as f:
+ for line in f:
+ items = line.rstrip('\n').split()
+ assert len(items) >= 2
+ key = key_type(items[0])
+ val = items[1:] if len(items) > 2 else items[1]
+ mapping[key] = val
+ return mapping
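+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative), assuming FileClient's default disk
+    # backend: write a small annotation file and parse it back.
+    import tempfile
+
+    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
+        f.write('a 1\nb 2 3\n')
+        name = f.name
+    print(list_from_file(name, prefix='item_'))   # ['item_a 1', 'item_b 2 3']
+    print(dict_from_file(name))                   # {'a': '1', 'b': ['2', '3']}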
diff --git a/mmcv/image/__init__.py b/mmcv/image/__init__.py
new file mode 100644
index 0000000..5d7edb2
--- /dev/null
+++ b/mmcv/image/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .geometric import (cutout, imcrop, imflip, imflip_, impad,
+ impad_to_multiple, imrescale, imresize, imresize_like,
+ imresize_to_multiple, imrotate, imshear, imtranslate,
+ rescale_size)
+from .io import imfrombytes, imread, imwrite, supported_backends, use_backend
+from .photometric import (adjust_brightness, adjust_color, adjust_contrast,
+ adjust_lighting, adjust_sharpness, auto_contrast,
+ clahe, imdenormalize, imequalize, iminvert,
+ imnormalize, imnormalize_, lut_transform, posterize,
+ solarize)
+from .misc import tensor2imgs
+from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr,
+ gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert,
+ rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb)
+# __all__ = [
+# 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb',
+# 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale',
+# 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size',
+# 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate',
+# 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend',
+# 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize',
+# 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr',
+# 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize',
+# 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe',
+# 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting'
+# ]
diff --git a/mmcv/image/colorspace.py b/mmcv/image/colorspace.py
new file mode 100644
index 0000000..8145339
--- /dev/null
+++ b/mmcv/image/colorspace.py
@@ -0,0 +1,306 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import cv2
+import numpy as np
+
+
+def imconvert(img, src, dst):
+ """Convert an image from the src colorspace to dst colorspace.
+
+ Args:
+ img (ndarray): The input image.
+ src (str): The source colorspace, e.g., 'rgb', 'hsv'.
+ dst (str): The destination colorspace, e.g., 'rgb', 'hsv'.
+
+ Returns:
+ ndarray: The converted image.
+ """
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+
+def bgr2gray(img, keepdim=False):
+ """Convert a BGR image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def rgb2gray(img, keepdim=False):
+ """Convert a RGB image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def gray2bgr(img):
+ """Convert a grayscale image to BGR image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted BGR image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+ return out_img
+
+
+def gray2rgb(img):
+ """Convert a grayscale image to RGB image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted RGB image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ return out_img
+
+
+def _convert_input_type_range(img):
+ """Convert the type and range of the input image.
+
+ It converts the input image to np.float32 type and range of [0, 1].
+ It is mainly used for pre-processing the input image in colorspace
+ conversion functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with type of np.float32 and range of
+ [0, 1].
+ """
+ img_type = img.dtype
+ img = img.astype(np.float32)
+ if img_type == np.float32:
+ pass
+ elif img_type == np.uint8:
+ img /= 255.
+ else:
+ raise TypeError('The img type should be np.float32 or np.uint8, '
+ f'but got {img_type}')
+ return img
+
+
+def _convert_output_type_range(img, dst_type):
+ """Convert the type and range of the image according to dst_type.
+
+ It converts the image to desired type and range. If `dst_type` is np.uint8,
+ images will be converted to np.uint8 type with range [0, 255]. If
+ `dst_type` is np.float32, it converts the image to np.float32 type with
+ range [0, 1].
+ It is mainly used for post-processing images in colorspace conversion
+ functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The image to be converted with np.float32 type and
+ range [0, 255].
+ dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it
+ converts the image to np.uint8 type with range [0, 255]. If
+ dst_type is np.float32, it converts the image to np.float32 type
+ with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with desired type and range.
+ """
+ if dst_type not in (np.uint8, np.float32):
+ raise TypeError('The dst_type should be np.float32 or np.uint8, '
+ f'but got {dst_type}')
+ if dst_type == np.uint8:
+ img = img.round()
+ else:
+ img /= 255.
+ return img.astype(dst_type)
+
+
+def rgb2ycbcr(img, y_only=False):
+ """Convert a RGB image to YCbCr image.
+
+ This function produces the same results as Matlab's `rgb2ycbcr` function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
+ [24.966, 112.0, -18.214]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def bgr2ycbcr(img, y_only=False):
+ """Convert a BGR image to YCbCr image.
+
+ The bgr version of rgb2ycbcr.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2rgb(img):
+ """Convert a YCbCr image to RGB image.
+
+ This function produces the same results as Matlab's ycbcr2rgb function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted RGB image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0, -0.00153632, 0.00791071],
+ [0.00625893, -0.00318811, 0]]) * 255.0 + [
+ -222.921, 135.576, -276.836
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2bgr(img):
+ """Convert a YCbCr image to BGR image.
+
+ The bgr version of ycbcr2rgb.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted BGR image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0.00791071, -0.00153632, 0],
+ [0, -0.00318811, 0.00625893]]) * 255.0 + [
+ -276.836, 135.576, -222.921
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def convert_color_factory(src, dst):
+
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+
+ def convert_color(img):
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+ convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()}
+ image.
+
+ Args:
+ img (ndarray or str): The input image.
+
+ Returns:
+ ndarray: The converted {dst.upper()} image.
+ """
+
+ return convert_color
+
+
+bgr2rgb = convert_color_factory('bgr', 'rgb')
+
+rgb2bgr = convert_color_factory('rgb', 'bgr')
+
+bgr2hsv = convert_color_factory('bgr', 'hsv')
+
+hsv2bgr = convert_color_factory('hsv', 'bgr')
+
+bgr2hls = convert_color_factory('bgr', 'hls')
+
+hls2bgr = convert_color_factory('hls', 'bgr')
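+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative): BT.601 round trip and a
+    # factory-generated converter, applied to a random uint8 image.
+    img = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)
+
+    ycbcr = rgb2ycbcr(img)
+    restored = ycbcr2rgb(ycbcr)
+    # Round-tripping through uint8 only introduces small rounding errors.
+    print(np.abs(restored.astype(np.int16) - img.astype(np.int16)).max())
+    print(bgr2rgb(img).shape)                     # (4, 4, 3)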
diff --git a/mmcv/image/geometric.py b/mmcv/image/geometric.py
new file mode 100644
index 0000000..cf97c20
--- /dev/null
+++ b/mmcv/image/geometric.py
@@ -0,0 +1,728 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numbers
+
+import cv2
+import numpy as np
+
+from ..utils import to_2tuple
+from .io import imread_backend
+
+try:
+ from PIL import Image
+except ImportError:
+ Image = None
+
+
+def _scale_size(size, scale):
+ """Rescale a size by a ratio.
+
+ Args:
+ size (tuple[int]): (w, h).
+ scale (float | tuple(float)): Scaling factor.
+
+ Returns:
+ tuple[int]: scaled size.
+ """
+ if isinstance(scale, (float, int)):
+ scale = (scale, scale)
+ w, h = size
+ return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
+
+
+cv2_interp_codes = {
+ 'nearest': cv2.INTER_NEAREST,
+ 'bilinear': cv2.INTER_LINEAR,
+ 'bicubic': cv2.INTER_CUBIC,
+ 'area': cv2.INTER_AREA,
+ 'lanczos': cv2.INTER_LANCZOS4
+}
+
+if Image is not None:
+ pillow_interp_codes = {
+ 'nearest': Image.NEAREST,
+ 'bilinear': Image.BILINEAR,
+ 'bicubic': Image.BICUBIC,
+ 'box': Image.BOX,
+ 'lanczos': Image.LANCZOS,
+ 'hamming': Image.HAMMING
+ }
+
+
+def imresize(img,
+ size,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+ """Resize image to a given size.
+
+ Args:
+ img (ndarray): The input image.
+ size (tuple[int]): Target size (w, h).
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(f'backend: {backend} is not supported for resize. '
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+ assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+ resized_img = np.array(pil_image)
+ else:
+ resized_img = cv2.resize(
+ img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
+ if not return_scale:
+ return resized_img
+ else:
+ w_scale = size[0] / w
+ h_scale = size[1] / h
+ return resized_img, w_scale, h_scale
+
+
+def imresize_to_multiple(img,
+ divisor,
+ size=None,
+ scale_factor=None,
+ keep_ratio=False,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+ """Resize image according to a given size or scale factor and then rounds
+ up the the resized or rescaled image size to the nearest value that can be
+ divided by the divisor.
+
+ Args:
+ img (ndarray): The input image.
+ divisor (int | tuple): Resized image size will be a multiple of
+ divisor. If divisor is a tuple, divisor should be
+ (w_divisor, h_divisor).
+ size (None | int | tuple[int]): Target size (w, h). Default: None.
+ scale_factor (None | float | tuple[float]): Multiplier for spatial
+ size. Should match input size if it is a tuple and the 2D style is
+ (w_scale_factor, h_scale_factor). Default: None.
+ keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+ image. Default: False.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if size is not None and scale_factor is not None:
+ raise ValueError('only one of size or scale_factor should be defined')
+ elif size is None and scale_factor is None:
+ raise ValueError('one of size or scale_factor should be defined')
+ elif size is not None:
+ size = to_2tuple(size)
+ if keep_ratio:
+ size = rescale_size((w, h), size, return_scale=False)
+ else:
+ size = _scale_size((w, h), scale_factor)
+
+ divisor = to_2tuple(divisor)
+ size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)])
+ resized_img, w_scale, h_scale = imresize(
+ img,
+ size,
+ return_scale=True,
+ interpolation=interpolation,
+ out=out,
+ backend=backend)
+ if return_scale:
+ return resized_img, w_scale, h_scale
+ else:
+ return resized_img
+
+
+def imresize_like(img,
+ dst_img,
+ return_scale=False,
+ interpolation='bilinear',
+ backend=None):
+ """Resize image to the same size of a given image.
+
+ Args:
+ img (ndarray): The input image.
+ dst_img (ndarray): The target image.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = dst_img.shape[:2]
+ return imresize(img, (w, h), return_scale, interpolation, backend=backend)
+
+
+def rescale_size(old_size, scale, return_scale=False):
+ """Calculate the new size to be rescaled to.
+
+ Args:
+ old_size (tuple[int]): The old size (w, h) of image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image size.
+
+ Returns:
+ tuple[int]: The new rescaled image size.
+ """
+ w, h = old_size
+ if isinstance(scale, (float, int)):
+ if scale <= 0:
+ raise ValueError(f'Invalid scale {scale}, must be positive.')
+ scale_factor = scale
+ elif isinstance(scale, tuple):
+ max_long_edge = max(scale)
+ max_short_edge = min(scale)
+ scale_factor = min(max_long_edge / max(h, w),
+ max_short_edge / min(h, w))
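+        # e.g. old_size=(1280, 720), scale=(1333, 800): the factor becomes
+        # min(1333 / 1280, 800 / 720) ~= 1.04, so the rescaled size satisfies
+        # both the long-edge and the short-edge constraints.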
+ else:
+ raise TypeError(
+ f'Scale must be a number or tuple of int, but got {type(scale)}')
+
+ new_size = _scale_size((w, h), scale_factor)
+
+ if return_scale:
+ return new_size, scale_factor
+ else:
+ return new_size
+
+
+def imrescale(img,
+ scale,
+ return_scale=False,
+ interpolation='bilinear',
+ backend=None):
+ """Resize image while keeping the aspect ratio.
+
+ Args:
+ img (ndarray): The input image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The rescaled image.
+ """
+ h, w = img.shape[:2]
+ new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+ rescaled_img = imresize(
+ img, new_size, interpolation=interpolation, backend=backend)
+ if return_scale:
+ return rescaled_img, scale_factor
+ else:
+ return rescaled_img
+
+
+def imflip(img, direction='horizontal'):
+ """Flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image.
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return np.flip(img, axis=1)
+ elif direction == 'vertical':
+ return np.flip(img, axis=0)
+ else:
+ return np.flip(img, axis=(0, 1))
+
+
+def imflip_(img, direction='horizontal'):
+ """Inplace flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image (inplace).
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return cv2.flip(img, 1, img)
+ elif direction == 'vertical':
+ return cv2.flip(img, 0, img)
+ else:
+ return cv2.flip(img, -1, img)
+
+
+def imrotate(img,
+ angle,
+ center=None,
+ scale=1.0,
+ border_value=0,
+ interpolation='bilinear',
+ auto_bound=False):
+ """Rotate an image.
+
+ Args:
+ img (ndarray): Image to be rotated.
+ angle (float): Rotation angle in degrees, positive values mean
+ clockwise rotation.
+ center (tuple[float], optional): Center point (w, h) of the rotation in
+ the source image. If not specified, the center of the image will be
+ used.
+ scale (float): Isotropic scale factor.
+ border_value (int): Border value.
+ interpolation (str): Same as :func:`resize`.
+ auto_bound (bool): Whether to adjust the image size to cover the whole
+ rotated image.
+
+ Returns:
+ ndarray: The rotated image.
+ """
+ if center is not None and auto_bound:
+ raise ValueError('`auto_bound` conflicts with `center`')
+ h, w = img.shape[:2]
+ if center is None:
+ center = ((w - 1) * 0.5, (h - 1) * 0.5)
+ assert isinstance(center, tuple)
+
+ matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+ if auto_bound:
+ cos = np.abs(matrix[0, 0])
+ sin = np.abs(matrix[0, 1])
+ new_w = h * sin + w * cos
+ new_h = h * cos + w * sin
+ matrix[0, 2] += (new_w - w) * 0.5
+ matrix[1, 2] += (new_h - h) * 0.5
+ w = int(np.round(new_w))
+ h = int(np.round(new_h))
+ rotated = cv2.warpAffine(
+ img,
+ matrix, (w, h),
+ flags=cv2_interp_codes[interpolation],
+ borderValue=border_value)
+ return rotated
+
+
+def bbox_clip(bboxes, img_shape):
+ """Clip bboxes to fit the image shape.
+
+ Args:
+ bboxes (ndarray): Shape (..., 4*k)
+ img_shape (tuple[int]): (height, width) of the image.
+
+ Returns:
+ ndarray: Clipped bboxes.
+ """
+ assert bboxes.shape[-1] % 4 == 0
+ cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
+ cmin[0::2] = img_shape[1] - 1
+ cmin[1::2] = img_shape[0] - 1
+ clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)
+ return clipped_bboxes
+
+
+def bbox_scaling(bboxes, scale, clip_shape=None):
+ """Scaling bboxes w.r.t the box center.
+
+ Args:
+ bboxes (ndarray): Shape(..., 4).
+ scale (float): Scaling factor.
+ clip_shape (tuple[int], optional): If specified, bboxes that exceed the
+ boundary will be clipped according to the given shape (h, w).
+
+ Returns:
+ ndarray: Scaled bboxes.
+ """
+ if float(scale) == 1.0:
+ scaled_bboxes = bboxes.copy()
+ else:
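+        # Boxes are treated as inclusive pixel coordinates, hence the +1 when
+        # computing width/height from (x1, y1, x2, y2).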
+ w = bboxes[..., 2] - bboxes[..., 0] + 1
+ h = bboxes[..., 3] - bboxes[..., 1] + 1
+ dw = (w * (scale - 1)) * 0.5
+ dh = (h * (scale - 1)) * 0.5
+ scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)
+ if clip_shape is not None:
+ return bbox_clip(scaled_bboxes, clip_shape)
+ else:
+ return scaled_bboxes
+
+
+def imcrop(img, bboxes, scale=1.0, pad_fill=None):
+ """Crop image patches.
+
+ 3 steps: scale the bboxes -> clip bboxes -> crop and pad.
+
+ Args:
+ img (ndarray): Image to be cropped.
+ bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
+ scale (float, optional): Scale ratio of bboxes, the default value
+ 1.0 means no scaling.
+ pad_fill (Number | list[Number]): Value to be filled for padding.
+ Default: None, which means no padding.
+
+ Returns:
+ list[ndarray] | ndarray: The cropped image patches.
+ """
+ chn = 1 if img.ndim == 2 else img.shape[2]
+ if pad_fill is not None:
+ if isinstance(pad_fill, (int, float)):
+ pad_fill = [pad_fill for _ in range(chn)]
+ assert len(pad_fill) == chn
+
+ _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes
+ scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)
+ clipped_bbox = bbox_clip(scaled_bboxes, img.shape)
+
+ patches = []
+ for i in range(clipped_bbox.shape[0]):
+ x1, y1, x2, y2 = tuple(clipped_bbox[i, :])
+ if pad_fill is None:
+ patch = img[y1:y2 + 1, x1:x2 + 1, ...]
+ else:
+ _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])
+ if chn == 1:
+ patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1)
+ else:
+ patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn)
+ patch = np.array(
+ pad_fill, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ x_start = 0 if _x1 >= 0 else -_x1
+ y_start = 0 if _y1 >= 0 else -_y1
+ w = x2 - x1 + 1
+ h = y2 - y1 + 1
+ patch[y_start:y_start + h, x_start:x_start + w,
+ ...] = img[y1:y1 + h, x1:x1 + w, ...]
+ patches.append(patch)
+
+ if bboxes.ndim == 1:
+ return patches[0]
+ else:
+ return patches
+
+
+def impad(img,
+ *,
+ shape=None,
+ padding=None,
+ pad_val=0,
+ padding_mode='constant'):
+ """Pad the given image to a certain shape or pad on all sides with
+ specified padding mode and padding value.
+
+ Args:
+ img (ndarray): Image to be padded.
+ shape (tuple[int]): Expected padding shape (h, w). Default: None.
+ padding (int or tuple[int]): Padding on each border. If a single int is
+ provided this is used to pad all borders. If tuple of length 2 is
+ provided this is the padding on left/right and top/bottom
+ respectively. If a tuple of length 4 is provided this is the
+ padding for the left, top, right and bottom borders respectively.
+ Default: None. Note that `shape` and `padding` cannot both be
+ set.
+ pad_val (Number | Sequence[Number]): Values to be filled in padding
+ areas when padding_mode is 'constant'. Default: 0.
+ padding_mode (str): Type of padding. Should be: constant, edge,
+ reflect or symmetric. Default: constant.
+
+ - constant: pads with a constant value, this value is specified
+ with pad_val.
+ - edge: pads with the last value at the edge of the image.
+ - reflect: pads with reflection of image without repeating the
+ last value on the edge. For example, padding [1, 2, 3, 4]
+ with 2 elements on both sides in reflect mode will result
+ in [3, 2, 1, 2, 3, 4, 3, 2].
+ - symmetric: pads with reflection of image repeating the last
+ value on the edge. For example, padding [1, 2, 3, 4] with
+ 2 elements on both sides in symmetric mode will result in
+ [2, 1, 1, 2, 3, 4, 4, 3]
+
+ Returns:
+ ndarray: The padded image.
+ """
+
+ assert (shape is not None) ^ (padding is not None)
+ if shape is not None:
+ padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0])
+
+ # check pad_val
+ if isinstance(pad_val, tuple):
+ assert len(pad_val) == img.shape[-1]
+ elif not isinstance(pad_val, numbers.Number):
+ raise TypeError('pad_val must be an int or a tuple. '
+ f'But received {type(pad_val)}')
+
+ # check padding
+ if isinstance(padding, tuple) and len(padding) in [2, 4]:
+ if len(padding) == 2:
+ padding = (padding[0], padding[1], padding[0], padding[1])
+ elif isinstance(padding, numbers.Number):
+ padding = (padding, padding, padding, padding)
+ else:
+ raise ValueError('Padding must be an int or a tuple of 2 or 4 elements. '
+ f'But received {padding}')
+
+ # check padding mode
+ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+ border_type = {
+ 'constant': cv2.BORDER_CONSTANT,
+ 'edge': cv2.BORDER_REPLICATE,
+ 'reflect': cv2.BORDER_REFLECT_101,
+ 'symmetric': cv2.BORDER_REFLECT
+ }
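+    # `padding` is ordered (left, top, right, bottom), while
+    # cv2.copyMakeBorder expects (top, bottom, left, right); the indices
+    # below perform that reordering.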
+ img = cv2.copyMakeBorder(
+ img,
+ padding[1],
+ padding[3],
+ padding[0],
+ padding[2],
+ border_type[padding_mode],
+ value=pad_val)
+
+ return img
+
+
+def impad_to_multiple(img, divisor, pad_val=0):
+ """Pad an image to ensure each edge to be multiple to some number.
+
+ Args:
+ img (ndarray): Image to be padded.
+ divisor (int): Padded image edges will be multiples of divisor.
+ pad_val (Number | Sequence[Number]): Same as :func:`impad`.
+
+ Returns:
+ ndarray: The padded image.
+ """
+ pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
+ pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
+ return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)
+
+
+def cutout(img, shape, pad_val=0):
+ """Randomly cut out a rectangle from the original img.
+
+ Args:
+ img (ndarray): Image to be cutout.
+ shape (int | tuple[int]): Expected cutout shape (h, w). If given as an
+ int, the value will be used for both h and w.
+ pad_val (int | float | tuple[int | float]): Values to be filled in the
+ cut area. Defaults to 0.
+
+ Returns:
+ ndarray: The cutout image.
+ """
+
+ channels = 1 if img.ndim == 2 else img.shape[2]
+ if isinstance(shape, int):
+ cut_h, cut_w = shape, shape
+ else:
+ assert isinstance(shape, tuple) and len(shape) == 2, \
+ f'shape must be an int or a tuple with length 2, but got type ' \
+ f'{type(shape)} instead.'
+ cut_h, cut_w = shape
+ if isinstance(pad_val, (int, float)):
+ pad_val = tuple([pad_val] * channels)
+ elif isinstance(pad_val, tuple):
+ assert len(pad_val) == channels, \
+ 'Expected the num of elements in tuple equals the channels ' \
+ 'of input image. Found {} vs {}'.format(
+ len(pad_val), channels)
+ else:
+ raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')
+
+ img_h, img_w = img.shape[:2]
+ y0 = np.random.uniform(img_h)
+ x0 = np.random.uniform(img_w)
+
+ y1 = int(max(0, y0 - cut_h / 2.))
+ x1 = int(max(0, x0 - cut_w / 2.))
+ y2 = min(img_h, y1 + cut_h)
+ x2 = min(img_w, x1 + cut_w)
+
+ if img.ndim == 2:
+ patch_shape = (y2 - y1, x2 - x1)
+ else:
+ patch_shape = (y2 - y1, x2 - x1, channels)
+
+ img_cutout = img.copy()
+ patch = np.array(
+ pad_val, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ img_cutout[y1:y2, x1:x2, ...] = patch
+
+ return img_cutout
+
+
+def _get_shear_matrix(magnitude, direction='horizontal'):
+ """Generate the shear matrix for transformation.
+
+ Args:
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The flip direction, either "horizontal"
+ or "vertical".
+
+ Returns:
+ ndarray: The shear matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])
+ elif direction == 'vertical':
+ shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])
+ return shear_matrix
+
+
+def imshear(img,
+ magnitude,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Shear an image.
+
+ Args:
+ img (ndarray): Image to be sheared with format (h, w)
+ or (h, w, c).
+ magnitude (int | float): The magnitude used for shear.
+ direction (str): The flip direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The sheared image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+ 'Expected the num of elements in tuple equals the channels ' \
+ 'of input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`')
+ shear_matrix = _get_shear_matrix(magnitude, direction)
+ sheared = cv2.warpAffine(
+ img,
+ shear_matrix,
+ (width, height),
+ # Note: when the number of elements in `border_value` is
+ # greater than 3 (e.g. shearing masks with more than 3
+ # channels), `cv2.warpAffine` raises a TypeError, so we
+ # simply slice the first 3 values in `border_value`.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return sheared
+
+
+def _get_translate_matrix(offset, direction='horizontal'):
+ """Generate the translate matrix.
+
+ Args:
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either
+ "horizontal" or "vertical".
+
+ Returns:
+ ndarray: The translate matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
+ elif direction == 'vertical':
+ translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
+ return translate_matrix
+
+
+def imtranslate(img,
+ offset,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Translate an image.
+
+ Args:
+ img (ndarray): Image to be translated with format
+ (h, w) or (h, w, c).
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The translated image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+ 'Expected the num of elements in tuple equals the channels ' \
+ 'of input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`.')
+ translate_matrix = _get_translate_matrix(offset, direction)
+ translated = cv2.warpAffine(
+ img,
+ translate_matrix,
+ (width, height),
+ # Note: when the number of elements in `border_value` is
+ # greater than 3 (e.g. translating masks with more than 3
+ # channels), `cv2.warpAffine` raises a TypeError, so we
+ # simply slice the first 3 values in `border_value`.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return translated
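+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative): a few of the geometric ops above,
+    # applied to a random uint8 image (shapes in comments are examples).
+    img = np.random.randint(0, 256, (90, 160, 3), dtype=np.uint8)
+
+    print(imresize(img, (80, 45)).shape)                    # (45, 80, 3)
+    rescaled, scale = imrescale(img, (100, 60), return_scale=True)
+    print(rescaled.shape, scale)                            # (56, 100, 3) 0.625
+    print(impad(img, shape=(96, 192), pad_val=0).shape)     # (96, 192, 3)
+    print(imrotate(img, 30, auto_bound=True).shape)         # enlarged canvas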
diff --git a/mmcv/image/io.py b/mmcv/image/io.py
new file mode 100644
index 0000000..69369f0
--- /dev/null
+++ b/mmcv/image/io.py
@@ -0,0 +1,262 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import io
+import os.path as osp
+from pathlib import Path
+
+import cv2
+import numpy as np
+from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
+ IMREAD_UNCHANGED)
+
+from mmcv.utils import check_file_exist, is_str, mkdir_or_exist
+
+try:
+ from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
+except ImportError:
+ TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None
+
+try:
+ from PIL import Image, ImageOps
+except ImportError:
+ Image = None
+
+try:
+ import tifffile
+except ImportError:
+ tifffile = None
+
+jpeg = None
+supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']
+
+imread_flags = {
+ 'color': IMREAD_COLOR,
+ 'grayscale': IMREAD_GRAYSCALE,
+ 'unchanged': IMREAD_UNCHANGED,
+ 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
+ 'grayscale_ignore_orientation':
+ IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
+}
+
+imread_backend = 'cv2'
+
+
+def use_backend(backend):
+ """Select a backend for image decoding.
+
+ Args:
+ backend (str): The image decoding backend type. Options are `cv2`,
+ `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)
+ and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg`
+ file format.
+ """
+ assert backend in supported_backends
+ global imread_backend
+ imread_backend = backend
+ if imread_backend == 'turbojpeg':
+ if TurboJPEG is None:
+ raise ImportError('`PyTurboJPEG` is not installed')
+ global jpeg
+ if jpeg is None:
+ jpeg = TurboJPEG()
+ elif imread_backend == 'pillow':
+ if Image is None:
+ raise ImportError('`Pillow` is not installed')
+ elif imread_backend == 'tifffile':
+ if tifffile is None:
+ raise ImportError('`tifffile` is not installed')
+
+
+def _jpegflag(flag='color', channel_order='bgr'):
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'color':
+ if channel_order == 'bgr':
+ return TJPF_BGR
+ elif channel_order == 'rgb':
+ return TJCS_RGB
+ elif flag == 'grayscale':
+ return TJPF_GRAY
+ else:
+ raise ValueError('flag must be "color" or "grayscale"')
+
+
+def _pillow2array(img, flag='color', channel_order='bgr'):
+ """Convert a pillow image to numpy array.
+
+ Args:
+ img (:obj:`PIL.Image.Image`): The image loaded using PIL
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are 'color', 'grayscale' and 'unchanged'.
+ Default to 'color'.
+ channel_order (str): The channel order of the output image array,
+ candidates are 'bgr' and 'rgb'. Default to 'bgr'.
+
+ Returns:
+ np.ndarray: The converted numpy array
+ """
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'unchanged':
+ array = np.array(img)
+ if array.ndim >= 3 and array.shape[2] >= 3: # color image
+ array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
+ else:
+ # Handle exif orientation tag
+ if flag in ['color', 'grayscale']:
+ img = ImageOps.exif_transpose(img)
+ # If the image mode is not 'RGB', convert it to 'RGB' first.
+ if img.mode != 'RGB':
+ if img.mode != 'LA':
+ # Most formats except 'LA' can be directly converted to RGB
+ img = img.convert('RGB')
+ else:
+ # When the mode is 'LA', the default conversion will fill in
+ # the canvas with black, which sometimes shadows black objects
+ # in the foreground.
+ #
+ # Therefore, a random color (124, 117, 104) is used for canvas
+ img_rgba = img.convert('RGBA')
+ img = Image.new('RGB', img_rgba.size, (124, 117, 104))
+ img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
+ if flag in ['color', 'color_ignore_orientation']:
+ array = np.array(img)
+ if channel_order != 'rgb':
+ array = array[:, :, ::-1] # RGB to BGR
+ elif flag in ['grayscale', 'grayscale_ignore_orientation']:
+ img = img.convert('L')
+ array = np.array(img)
+ else:
+ raise ValueError(
+ 'flag must be "color", "grayscale", "unchanged", '
+ f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
+ f' but got {flag}')
+ return array
+
+
+def imread(img_or_path, flag='color', channel_order='bgr', backend=None):
+ """Read an image.
+
+ Args:
+ img_or_path (ndarray or str or Path): Either a numpy array or str or
+ pathlib.Path. If it is a numpy array (loaded image), then
+ it will be returned as is.
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are `color`, `grayscale`, `unchanged`,
+ `color_ignore_orientation` and `grayscale_ignore_orientation`.
+ By default, `cv2` and `pillow` backend would rotate the image
+ according to its EXIF info unless called with `unchanged` or
+ `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
+ always ignore image's EXIF info regardless of the flag.
+ The `turbojpeg` backend only supports `color` and `grayscale`.
+ channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
+ If backend is None, the global imread_backend specified by
+ ``mmcv.use_backend()`` will be used. Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+ """
+
+ if backend is None:
+ backend = imread_backend
+ if backend not in supported_backends:
+ raise ValueError(f'backend: {backend} is not supported. Supported '
+ "backends are 'cv2', 'turbojpeg', 'pillow'")
+ if isinstance(img_or_path, Path):
+ img_or_path = str(img_or_path)
+
+ if isinstance(img_or_path, np.ndarray):
+ return img_or_path
+ elif is_str(img_or_path):
+ check_file_exist(img_or_path,
+ f'img file does not exist: {img_or_path}')
+ if backend == 'turbojpeg':
+ with open(img_or_path, 'rb') as in_file:
+ img = jpeg.decode(in_file.read(),
+ _jpegflag(flag, channel_order))
+ if img.shape[-1] == 1:
+ img = img[:, :, 0]
+ return img
+ elif backend == 'pillow':
+ img = Image.open(img_or_path)
+ img = _pillow2array(img, flag, channel_order)
+ return img
+ elif backend == 'tifffile':
+ img = tifffile.imread(img_or_path)
+ return img
+ else:
+ flag = imread_flags[flag] if is_str(flag) else flag
+ img = cv2.imread(img_or_path, flag)
+ if flag == IMREAD_COLOR and channel_order == 'rgb':
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+ return img
+ else:
+ raise TypeError('"img" must be a numpy array or a str or '
+ 'a pathlib.Path object')
+
+
+def imfrombytes(content, flag='color', channel_order='bgr', backend=None):
+ """Read an image from bytes.
+
+ Args:
+ content (bytes): Image bytes got from files or other streams.
+ flag (str): Same as :func:`imread`.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the
+ global imread_backend specified by ``mmcv.use_backend()`` will be
+ used. Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+ """
+
+ if backend is None:
+ backend = imread_backend
+ if backend not in supported_backends:
+ raise ValueError(f'backend: {backend} is not supported. Supported '
+ "backends are 'cv2', 'turbojpeg', 'pillow'")
+ if backend == 'turbojpeg':
+ img = jpeg.decode(content, _jpegflag(flag, channel_order))
+ if img.shape[-1] == 1:
+ img = img[:, :, 0]
+ return img
+ elif backend == 'pillow':
+ buff = io.BytesIO(content)
+ img = Image.open(buff)
+ img = _pillow2array(img, flag, channel_order)
+ return img
+ else:
+ img_np = np.frombuffer(content, np.uint8)
+ flag = imread_flags[flag] if is_str(flag) else flag
+ img = cv2.imdecode(img_np, flag)
+ if flag == IMREAD_COLOR and channel_order == 'rgb':
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+ return img
+
+
+def imwrite(img, file_path, params=None, auto_mkdir=True):
+ """Write image to file.
+
+ Args:
+ img (ndarray): Image array to be written.
+ file_path (str): Image file path.
+ params (None or list): Same as opencv :func:`imwrite` interface.
+ auto_mkdir (bool): If the parent folder of `file_path` does not exist,
+ whether to create it automatically.
+
+ Returns:
+ bool: Successful or not.
+ """
+ if auto_mkdir:
+ dir_name = osp.abspath(osp.dirname(file_path))
+ mkdir_or_exist(dir_name)
+ return cv2.imwrite(file_path, img, params)
+
+
+
+
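+if __name__ == '__main__':
+    # Minimal usage sketch (illustrative), using the default 'cv2' backend:
+    # write a random image to a temporary path and read it back.
+    import tempfile
+
+    img = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)
+    path = osp.join(tempfile.mkdtemp(), 'demo.png')
+    print(imwrite(img, path))                  # True on success
+    print(imread(path, flag='color').shape)    # (32, 32, 3)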
diff --git a/mmcv/image/misc.py b/mmcv/image/misc.py
new file mode 100644
index 0000000..a52304a
--- /dev/null
+++ b/mmcv/image/misc.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.image import imdenormalize
+
+try:
+ import torch
+except ImportError:
+ torch = None
+
+
+def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
+ """Convert tensor to 3-channel images.
+
+ Args:
+ tensor (torch.Tensor): Tensor that contains multiple images, shape (
+ N, C, H, W).
+ mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0).
+ std (tuple[float], optional): Standard deviation of images.
+ Defaults to (1, 1, 1).
+ to_rgb (bool, optional): Whether the tensor was converted to RGB
+ format in the first place. If so, convert it back to BGR.
+ Defaults to True.
+
+ Returns:
+ list[np.ndarray]: A list that contains multiple images.
+ """
+
+ if torch is None:
+ raise RuntimeError('pytorch is not installed')
+ assert torch.is_tensor(tensor) and tensor.ndim == 4
+ assert len(mean) == 3
+ assert len(std) == 3
+
+ num_imgs = tensor.size(0)
+ mean = np.array(mean, dtype=np.float32)
+ std = np.array(std, dtype=np.float32)
+ imgs = []
+ for img_id in range(num_imgs):
+ img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
+ img = imdenormalize(
+ img, mean, std, to_bgr=to_rgb).astype(np.uint8)
+ imgs.append(np.ascontiguousarray(img))
+ return imgs
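+
+
+if __name__ == '__main__' and torch is not None:
+    # Minimal usage sketch (illustrative): with the default mean/std the
+    # values pass through unchanged before being cast to uint8 images.
+    batch = torch.rand(2, 3, 8, 8) * 255
+    imgs = tensor2imgs(batch, to_rgb=False)
+    print(len(imgs), imgs[0].shape, imgs[0].dtype)   # 2 (8, 8, 3) uint8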
diff --git a/mmcv/image/photometric.py b/mmcv/image/photometric.py
new file mode 100644
index 0000000..5085d01
--- /dev/null
+++ b/mmcv/image/photometric.py
@@ -0,0 +1,428 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import cv2
+import numpy as np
+
+from ..utils import is_tuple_of
+from .colorspace import bgr2gray, gray2bgr
+
+
+def imnormalize(img, mean, std, to_rgb=True):
+ """Normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalization.
+ std (ndarray): The std to be used for normalization.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ img = img.copy().astype(np.float32)
+ return imnormalize_(img, mean, std, to_rgb)
+
+
+def imnormalize_(img, mean, std, to_rgb=True):
+ """Inplace normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+ mean (ndarray): The mean to be used for normalization.
+ std (ndarray): The std to be used for normalization.
+ to_rgb (bool): Whether to convert to rgb.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ # cv2 inplace normalization does not accept uint8
+ assert img.dtype != np.uint8
+ mean = np.float64(mean.reshape(1, -1))
+ stdinv = 1 / np.float64(std.reshape(1, -1))
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+ cv2.subtract(img, mean, img) # inplace
+ cv2.multiply(img, stdinv, img) # inplace
+ return img
+
+
+def imdenormalize(img, mean, std, to_bgr=True):
+ assert img.dtype != np.uint8
+ mean = mean.reshape(1, -1).astype(np.float64)
+ std = std.reshape(1, -1).astype(np.float64)
+ img = cv2.multiply(img, std) # make a copy
+ cv2.add(img, mean, img) # inplace
+ if to_bgr:
+ cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace
+ return img
+
+
+def iminvert(img):
+ """Invert (negate) an image.
+
+ Args:
+ img (ndarray): Image to be inverted.
+
+ Returns:
+ ndarray: The inverted image.
+ """
+ return np.full_like(img, 255) - img
+
+
+def solarize(img, thr=128):
+ """Solarize an image (invert all pixel values above a threshold)
+
+ Args:
+ img (ndarray): Image to be solarized.
+ thr (int): Threshold for solarizing (0 - 255).
+
+ Returns:
+ ndarray: The solarized image.
+ """
+ img = np.where(img < thr, img, 255 - img)
+ return img
+
+
+def posterize(img, bits):
+ """Posterize an image (reduce the number of bits for each color channel)
+
+ Args:
+ img (ndarray): Image to be posterized.
+ bits (int): Number of bits (1 to 8) to use for posterizing.
+
+ Returns:
+ ndarray: The posterized image.
+ """
+ shift = 8 - bits
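+    # e.g. bits=3 -> shift=5: shifting right then left by 5 zeroes the five
+    # lowest bits, leaving 2**3 = 8 distinct levels per channel.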
+ img = np.left_shift(np.right_shift(img, shift), shift)
+ return img
+
+
+def adjust_color(img, alpha=1, beta=None, gamma=0):
+ r"""It blends the source image and its gray image:
+
+ .. math::
+ output = img * alpha + gray\_img * beta + gamma
+
+ Args:
+ img (ndarray): The input source image.
+ alpha (int | float): Weight for the source image. Default 1.
+ beta (int | float): Weight for the converted gray image.
+ If None, it's assigned the value (1 - `alpha`).
+ gamma (int | float): Scalar added to each sum.
+ Same as :func:`cv2.addWeighted`. Default 0.
+
+ Returns:
+ ndarray: Colored image which has the same size and dtype as input.
+ """
+ gray_img = bgr2gray(img)
+ gray_img = np.tile(gray_img[..., None], [1, 1, 3])
+ if beta is None:
+ beta = 1 - alpha
+ colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)
+ if not colored_img.dtype == np.uint8:
+ # Note when the dtype of `img` is not the default `np.uint8`
+ # (e.g. np.float32), the value in `colored_img` got from cv2
+ # is not guaranteed to be in range [0, 255], so here clip
+ # is needed.
+ colored_img = np.clip(colored_img, 0, 255)
+ return colored_img
+
+
+def imequalize(img):
+ """Equalize the image histogram.
+
+ This function applies a non-linear mapping to the input image,
+ in order to create a uniform distribution of grayscale values
+ in the output image.
+
+ Args:
+ img (ndarray): Image to be equalized.
+
+ Returns:
+ ndarray: The equalized image.
+ """
+
+ def _scale_channel(im, c):
+ """Scale the data in the corresponding channel."""
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # For computing the step, filter out the nonzeros.
+ nonzero_histo = histo[histo > 0]
+ step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
+ if not step:
+ lut = np.array(range(256))
+ else:
+ # Compute the cumulative sum, shifted by step // 2
+ # and then normalized by step.
+ lut = (np.cumsum(histo) + (step // 2)) // step
+ # Shift lut, prepending with 0.
+ lut = np.concatenate([[0], lut[:-1]], 0)
+ # handle potential integer overflow
+ lut[lut > 255] = 255
+ # If step is zero, return the original image.
+ # Otherwise, index from lut.
+ return np.where(np.equal(step, 0), im, lut[im])
+
+ # Scales each channel independently and then stacks
+ # the result.
+ s1 = _scale_channel(img, 0)
+ s2 = _scale_channel(img, 1)
+ s3 = _scale_channel(img, 2)
+ equalized_img = np.stack([s1, s2, s3], axis=-1)
+ return equalized_img.astype(img.dtype)
+
+
+def adjust_brightness(img, factor=1.):
+ """Adjust image brightness.
+
+ This function controls the brightness of an image. An
+ enhancement factor of 0.0 gives a black image.
+ A factor of 1.0 gives the original image. This function
+ blends the source image and the degenerated black image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be brightened.
+ factor (float): A value that controls the enhancement.
+ Factor 1.0 returns the original image, lower
+ factors mean less color (brightness, contrast,
+ etc), and higher values more. Default 1.
+
+ Returns:
+ ndarray: The brightened image.
+ """
+ degenerated = np.zeros_like(img)
+ # Note manually convert the dtype to np.float32, to
+ # achieve as close results as PIL.ImageEnhance.Brightness.
+ # Set beta=1-factor, and gamma=0
+ brightened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ brightened_img = np.clip(brightened_img, 0, 255)
+ return brightened_img.astype(img.dtype)
+
+
+def adjust_contrast(img, factor=1.):
+ """Adjust image contrast.
+
+ This function controls the contrast of an image. An
+ enhancement factor of 0.0 gives a solid grey
+ image. A factor of 1.0 gives the original image. It
+ blends the source image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+
+ Returns:
+ ndarray: The contrasted image.
+ """
+ gray_img = bgr2gray(img)
+ hist = np.histogram(gray_img, 256, (0, 255))[0]
+ mean = round(np.sum(gray_img) / np.sum(hist))
+ degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
+ degenerated = gray2bgr(degenerated)
+ contrasted_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ contrasted_img = np.clip(contrasted_img, 0, 255)
+ return contrasted_img.astype(img.dtype)
+
+
+def auto_contrast(img, cutoff=0):
+ """Auto adjust image contrast.
+
+ This function maximizes (normalizes) image contrast by first removing cutoff
+ percent of the lightest and darkest pixels from the histogram and remapping
+ the image so that the darkest pixel becomes black (0), and the lightest
+ becomes white (255).
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ cutoff (int | float | tuple): The cutoff percent of the lightest and
+ darkest pixels to be removed. If given as tuple, it shall be
+ (low, high). Otherwise, the single value will be used for both.
+ Defaults to 0.
+
+ Returns:
+ ndarray: The contrasted image.
+ """
+
+ def _auto_contrast_channel(im, c, cutoff):
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # Remove cut-off percent pixels from histo
+ histo_sum = np.cumsum(histo)
+ cut_low = histo_sum[-1] * cutoff[0] // 100
+ cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
+ histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
+ histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)
+
+ # Compute mapping
+ low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]
+ # If all the values have been cut off, return the original img
+ if low >= high:
+ return im
+ scale = 255.0 / (high - low)
+ offset = -low * scale
+ lut = np.array(range(256))
+ lut = lut * scale + offset
+ lut = np.clip(lut, 0, 255)
+ return lut[im]
+
+ if isinstance(cutoff, (int, float)):
+ cutoff = (cutoff, cutoff)
+ else:
+ assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
+ f'float or tuple, but got {type(cutoff)} instead.'
+ # Auto adjusts contrast for each channel independently and then stacks
+ # the result.
+ s1 = _auto_contrast_channel(img, 0, cutoff)
+ s2 = _auto_contrast_channel(img, 1, cutoff)
+ s3 = _auto_contrast_channel(img, 2, cutoff)
+ contrasted_img = np.stack([s1, s2, s3], axis=-1)
+ return contrasted_img.astype(img.dtype)
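+# Illustrative sketch: with cutoff=0 the darkest value in each channel is
+# remapped to 0 and the lightest to 255; a tuple gives separate low/high cuts.
+#   >>> img = np.random.randint(50, 200, (16, 16, 3), dtype=np.uint8)
+#   >>> out = auto_contrast(img, cutoff=(1, 5))
+#   >>> out.shape == img.shape and out.dtype == img.dtype
+#   True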
+
+
+def adjust_sharpness(img, factor=1., kernel=None):
+ """Adjust image sharpness.
+
+ This function controls the sharpness of an image. An
+ enhancement factor of 0.0 gives a blurred image. A
+ factor of 1.0 gives the original image. And a factor
+ of 2.0 gives a sharpened image. It blends the source
+ image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be sharpened. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+ kernel (np.ndarray, optional): Filter kernel to be applied on the img
+ to obtain the degenerated img. Defaults to None.
+
+ Note:
+ No sanity check is enforced on the kernel supplied by users, so with
+ an inappropriate kernel, ``adjust_sharpness`` may fail to perform
+ the operation its name indicates and instead apply whatever
+ transform the kernel determines.
+
+ Returns:
+ ndarray: The sharpened image.
+ """
+
+ if kernel is None:
+ # adapted from PIL.ImageFilter.SMOOTH
+ kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13
+ assert isinstance(kernel, np.ndarray), \
+ f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
+ assert kernel.ndim == 2, \
+ f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'
+
+ degenerated = cv2.filter2D(img, -1, kernel)
+ sharpened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ sharpened_img = np.clip(sharpened_img, 0, 255)
+ return sharpened_img.astype(img.dtype)
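+# Illustrative sketch: factor=0. returns the smoothed (degenerated) image,
+# factor=2. over-sharpens; the default kernel mimics PIL.ImageFilter.SMOOTH.
+#   >>> img = np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8)
+#   >>> blurred = adjust_sharpness(img, factor=0.)
+#   >>> sharpened = adjust_sharpness(img, factor=2.)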
+
+
+def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
+ """AlexNet-style PCA jitter.
+
+ This data augmentation is proposed in "ImageNet Classification with Deep
+ Convolutional Neural Networks" (the AlexNet paper).
+
+ Args:
+ img (ndarray): Image whose lighting is to be adjusted. BGR order.
+ eigval (ndarray): the eigenvalues of the covariance matrix of pixel
+ values.
+ eigvec (ndarray): the eigenvectors of the covariance matrix of pixel
+ values.
+ alphastd (float): The standard deviation for distribution of alpha.
+ Defaults to 0.1
+ to_rgb (bool): Whether to convert img to rgb.
+
+ Returns:
+ ndarray: The adjusted image.
+ """
+ assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
+ f'eigval and eigvec should both be of type np.ndarray, got ' \
+ f'{type(eigval)} and {type(eigvec)} instead.'
+
+ assert eigval.ndim == 1 and eigvec.ndim == 2
+ assert eigvec.shape == (3, eigval.shape[0])
+ n_eigval = eigval.shape[0]
+ assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
+ f'got {type(alphastd)} instead.'
+
+ img = img.copy().astype(np.float32)
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+
+ alpha = np.random.normal(0, alphastd, n_eigval)
+ alter = eigvec \
+ * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \
+ * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))
+ alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)
+ img_adjusted = img + alter
+ return img_adjusted
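+# Illustrative sketch. The eigenvalues/eigenvectors below are the commonly
+# quoted ImageNet PCA statistics and are shown only as an example; use the
+# values that match your own dataset/config.
+#   >>> img = np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8)
+#   >>> eigval = np.array([0.2175, 0.0188, 0.0045])
+#   >>> eigvec = np.array([[-0.5675, 0.7192, 0.4009],
+#   ...                    [-0.5808, -0.0045, -0.8140],
+#   ...                    [-0.5836, -0.6948, 0.4203]])
+#   >>> out = adjust_lighting(img, eigval, eigvec, alphastd=0.1)  # float32 result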
+
+
+def lut_transform(img, lut_table):
+ """Transform array by look-up table.
+
+ The function lut_transform fills the output array with values from the
+ look-up table. Indices of the entries are taken from the input array.
+
+ Args:
+ img (ndarray): Image to be transformed.
+ lut_table (ndarray): look-up table of 256 elements; in case of
+ multi-channel input array, the table should either have a single
+ channel (in this case the same table is used for all channels) or
+ the same number of channels as in the input array.
+
+ Returns:
+ ndarray: The transformed image.
+ """
+ assert isinstance(img, np.ndarray)
+ assert 0 <= np.min(img) and np.max(img) <= 255
+ assert isinstance(lut_table, np.ndarray)
+ assert lut_table.shape == (256, )
+
+ return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)
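+# Illustrative sketch: an identity look-up table leaves a uint8 image unchanged.
+#   >>> img = np.random.randint(0, 256, (8, 8, 3), dtype=np.uint8)
+#   >>> lut = np.arange(256, dtype=np.uint8)
+#   >>> bool((lut_transform(img, lut) == img).all())
+#   True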
+
+
+def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
+ """Use CLAHE method to process the image.
+
+ See `Zuiderveld, K. Contrast Limited Adaptive Histogram Equalization.
+ Graphics Gems, 1994: 474-485.` for more information.
+
+ Args:
+ img (ndarray): Image to be processed.
+ clip_limit (float): Threshold for contrast limiting. Default: 40.0.
+ tile_grid_size (tuple[int]): Size of grid for histogram equalization.
+ Input image will be divided into equally sized rectangular tiles.
+ It defines the number of tiles in row and column. Default: (8, 8).
+
+ Returns:
+ ndarray: The processed image.
+ """
+ assert isinstance(img, np.ndarray)
+ assert img.ndim == 2
+ assert isinstance(clip_limit, (float, int))
+ assert is_tuple_of(tile_grid_size, int)
+ assert len(tile_grid_size) == 2
+
+ clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
+ return clahe.apply(np.array(img, dtype=np.uint8))
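+# Illustrative sketch: clahe expects a single-channel (grayscale) uint8 image.
+#   >>> gray = np.random.randint(0, 256, (64, 64), dtype=np.uint8)
+#   >>> out = clahe(gray, clip_limit=40.0, tile_grid_size=(8, 8))
+#   >>> out.shape == gray.shape
+#   True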
diff --git a/mmcv/layers/__init__.py b/mmcv/layers/__init__.py
new file mode 100644
index 0000000..53f735c
--- /dev/null
+++ b/mmcv/layers/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .batch_norm import get_norm
+from .nms import batched_nms
+from .shape_spec import ShapeSpec
+from .wrappers import cat, Conv2d
+from .roi_align import ROIAlign
\ No newline at end of file
diff --git a/mmcv/layers/aspp.py b/mmcv/layers/aspp.py
new file mode 100644
index 0000000..14861aa
--- /dev/null
+++ b/mmcv/layers/aspp.py
@@ -0,0 +1,144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+from copy import deepcopy
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .batch_norm import get_norm
+from .blocks import DepthwiseSeparableConv2d
+from .wrappers import Conv2d
+
+
+class ASPP(nn.Module):
+ """
+ Atrous Spatial Pyramid Pooling (ASPP).
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ dilations,
+ *,
+ norm,
+ activation,
+ pool_kernel_size=None,
+ dropout: float = 0.0,
+ use_depthwise_separable_conv=False,
+ ):
+ """
+ Args:
+ in_channels (int): number of input channels for ASPP.
+ out_channels (int): number of output channels.
+ dilations (list): a list of 3 dilations in ASPP.
+ norm (str or callable): normalization for all conv layers.
+ See :func:`layers.get_norm` for supported format. norm is
+ applied to all conv layers except the conv following
+ global average pooling.
+ activation (callable): activation function.
+ pool_kernel_size (tuple, list): the average pooling size (kh, kw)
+ for image pooling layer in ASPP. If set to None, it always
+ performs global average pooling. If not None, the shape of the
+ inputs to forward() must be divisible by it. It is recommended
+ to use a fixed input feature size in training, and set this
+ option to match this size, so that it performs global average
+ pooling in training, and the size of the pooling window stays
+ consistent in inference.
+ dropout (float): apply dropout on the output of ASPP. It is used in
+ the official DeepLab implementation with a rate of 0.1:
+ https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa
+ use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d
+ for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`.
+ """
+ super(ASPP, self).__init__()
+ assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations))
+ self.pool_kernel_size = pool_kernel_size
+ self.dropout = dropout
+ use_bias = norm == ""
+ self.convs = nn.ModuleList()
+ # conv 1x1
+ self.convs.append(
+ Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ bias=use_bias,
+ norm=get_norm(norm, out_channels),
+ activation=deepcopy(activation),
+ )
+ )
+ weight_init.c2_xavier_fill(self.convs[-1])
+ # atrous convs
+ for dilation in dilations:
+ if use_depthwise_separable_conv:
+ self.convs.append(
+ DepthwiseSeparableConv2d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation,
+ norm1=norm,
+ activation1=deepcopy(activation),
+ norm2=norm,
+ activation2=deepcopy(activation),
+ )
+ )
+ else:
+ self.convs.append(
+ Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation,
+ bias=use_bias,
+ norm=get_norm(norm, out_channels),
+ activation=deepcopy(activation),
+ )
+ )
+ weight_init.c2_xavier_fill(self.convs[-1])
+ # image pooling
+ # We do not add BatchNorm because the spatial resolution is 1x1,
+ # although the original TF implementation has BatchNorm.
+ if pool_kernel_size is None:
+ image_pooling = nn.Sequential(
+ nn.AdaptiveAvgPool2d(1),
+ Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
+ )
+ else:
+ image_pooling = nn.Sequential(
+ nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1),
+ Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)),
+ )
+ weight_init.c2_xavier_fill(image_pooling[1])
+ self.convs.append(image_pooling)
+
+ self.project = Conv2d(
+ 5 * out_channels,
+ out_channels,
+ kernel_size=1,
+ bias=use_bias,
+ norm=get_norm(norm, out_channels),
+ activation=deepcopy(activation),
+ )
+ weight_init.c2_xavier_fill(self.project)
+
+ def forward(self, x):
+ size = x.shape[-2:]
+ if self.pool_kernel_size is not None:
+ if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]:
+ raise ValueError(
+ "The shape of inputs must be divisible by `pool_kernel_size`. "
+ "Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size)
+ )
+ res = []
+ for conv in self.convs:
+ res.append(conv(x))
+ res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False)
+ res = torch.cat(res, dim=1)
+ res = self.project(res)
+ res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res
+ return res
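+
+
+# A minimal usage sketch (illustrative only; channel sizes and dilations are
+# arbitrary and not taken from any particular config):
+#
+#   aspp = ASPP(
+#       in_channels=256,
+#       out_channels=256,
+#       dilations=[6, 12, 18],
+#       norm="BN",
+#       activation=nn.ReLU(),
+#   )
+#   y = aspp(torch.randn(2, 256, 32, 32))  # -> (2, 256, 32, 32)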
diff --git a/mmcv/layers/batch_norm.py b/mmcv/layers/batch_norm.py
new file mode 100644
index 0000000..9c9d19f
--- /dev/null
+++ b/mmcv/layers/batch_norm.py
@@ -0,0 +1,384 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.nn import functional as F
+from torch.autograd.function import Function
+from .wrappers import BatchNorm2d
+TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
+
+def get_world_size() -> int:
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size()
+
+class _AllReduce(Function):
+ @staticmethod
+ def forward(ctx, input: torch.Tensor) -> torch.Tensor:
+ input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())]
+ # Use allgather instead of allreduce since I don't trust in-place operations ..
+ dist.all_gather(input_list, input, async_op=False)
+ inputs = torch.stack(input_list, dim=0)
+ return torch.sum(inputs, dim=0)
+
+ @staticmethod
+ def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
+ dist.all_reduce(grad_output, async_op=False)
+ return grad_output
+
+def differentiable_all_reduce(input: torch.Tensor) -> torch.Tensor:
+ """
+ Differentiable counterpart of `dist.all_reduce`.
+ """
+ if (
+ not dist.is_available()
+ or not dist.is_initialized()
+ or dist.get_world_size() == 1
+ ):
+ return input
+ return _AllReduce.apply(input)
+
+
+class FrozenBatchNorm2d(nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+ It contains non-trainable buffers called
+ "weight" and "bias", "running_mean", "running_var",
+ initialized to perform identity transformation.
+
+ The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
+ which are computed from the original four parameters of BN.
+ The affine transform `x * weight + bias` will perform the equivalent
+ computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
+ When loading a backbone model from Caffe2, "running_mean" and "running_var"
+ will be left unchanged as identity transformation.
+
+ Other pre-trained backbone models may contain all 4 parameters.
+
+ The forward is implemented by `F.batch_norm(..., training=False)`.
+ """
+
+ _version = 3
+
+ def __init__(self, num_features, eps=1e-5):
+ super().__init__()
+ self.num_features = num_features
+ self.eps = eps
+ self.register_buffer("weight", torch.ones(num_features))
+ self.register_buffer("bias", torch.zeros(num_features))
+ self.register_buffer("running_mean", torch.zeros(num_features))
+ self.register_buffer("running_var", torch.ones(num_features) - eps)
+ self.register_buffer("num_batches_tracked", None)
+
+ def forward(self, x):
+ if x.requires_grad:
+ # When gradients are needed, F.batch_norm will use extra memory
+ # because its backward op computes gradients for weight/bias as well.
+ scale = self.weight * (self.running_var + self.eps).rsqrt()
+ bias = self.bias - self.running_mean * scale
+ scale = scale.reshape(1, -1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1)
+ out_dtype = x.dtype # may be half
+ return x * scale.to(out_dtype) + bias.to(out_dtype)
+ else:
+ # When gradients are not needed, F.batch_norm is a single fused op
+ # and provides more optimization opportunities.
+ return F.batch_norm(
+ x,
+ self.running_mean,
+ self.running_var,
+ self.weight,
+ self.bias,
+ training=False,
+ eps=self.eps,
+ )
+
+ def _load_from_state_dict(
+ self,
+ state_dict,
+ prefix,
+ local_metadata,
+ strict,
+ missing_keys,
+ unexpected_keys,
+ error_msgs,
+ ):
+ version = local_metadata.get("version", None)
+
+ if version is None or version < 2:
+ # No running_mean/var in early versions.
+ # This silences the warnings.
+ if prefix + "running_mean" not in state_dict:
+ state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
+ if prefix + "running_var" not in state_dict:
+ state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
+
+ super()._load_from_state_dict(
+ state_dict,
+ prefix,
+ local_metadata,
+ strict,
+ missing_keys,
+ unexpected_keys,
+ error_msgs,
+ )
+
+ def __repr__(self):
+ return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
+
+ @classmethod
+ def convert_frozen_batchnorm(cls, module):
+ """
+ Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+ Args:
+ module (torch.nn.Module):
+
+ Returns:
+ If module is BatchNorm/SyncBatchNorm, returns a new module.
+ Otherwise, in-place convert module and return it.
+
+ Similar to convert_sync_batchnorm in
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
+ """
+ bn_module = nn.modules.batchnorm
+ bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+ res = module
+ if isinstance(module, bn_module):
+ res = cls(module.num_features)
+ if module.affine:
+ res.weight.data = module.weight.data.clone().detach()
+ res.bias.data = module.bias.data.clone().detach()
+ res.running_mean.data = module.running_mean.data
+ res.running_var.data = module.running_var.data
+ res.eps = module.eps
+ res.num_batches_tracked = module.num_batches_tracked
+ else:
+ for name, child in module.named_children():
+ new_child = cls.convert_frozen_batchnorm(child)
+ if new_child is not child:
+ res.add_module(name, new_child)
+ return res
+
+ @classmethod
+ def convert_frozenbatchnorm2d_to_batchnorm2d(cls, module: nn.Module) -> nn.Module:
+ """
+ Convert all FrozenBatchNorm2d to BatchNorm2d
+
+ Args:
+ module (torch.nn.Module):
+
+ Returns:
+ If module is FrozenBatchNorm2d, returns a new module.
+ Otherwise, in-place convert module and return it.
+
+ This is needed for quantization:
+ https://fb.workplace.com/groups/1043663463248667/permalink/1296330057982005/
+ """
+
+ res = module
+ if isinstance(module, FrozenBatchNorm2d):
+ res = torch.nn.BatchNorm2d(module.num_features, module.eps)
+
+ res.weight.data = module.weight.data.clone().detach()
+ res.bias.data = module.bias.data.clone().detach()
+ res.running_mean.data = module.running_mean.data.clone().detach()
+ res.running_var.data = module.running_var.data.clone().detach()
+ res.eps = module.eps
+ res.num_batches_tracked = module.num_batches_tracked
+ else:
+ for name, child in module.named_children():
+ new_child = cls.convert_frozenbatchnorm2d_to_batchnorm2d(child)
+ if new_child is not child:
+ res.add_module(name, new_child)
+ return res
+
+
+def get_norm(norm, out_channels):
+ """
+ Args:
+ norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
+ or a callable that takes a channel number and returns
+ the normalization layer as a nn.Module.
+
+ Returns:
+ nn.Module or None: the normalization layer
+ """
+ if norm is None:
+ return None
+ if isinstance(norm, str):
+ if len(norm) == 0:
+ return None
+ norm = {
+ "BN": BatchNorm2d,
+ # Fixed in https://github.com/pytorch/pytorch/pull/36382
+ "SyncBN": NaiveSyncBatchNorm if TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm,
+ "FrozenBN": FrozenBatchNorm2d,
+ "GN": lambda channels: nn.GroupNorm(32, channels),
+ # for debugging:
+ "nnSyncBN": nn.SyncBatchNorm,
+ "naiveSyncBN": NaiveSyncBatchNorm,
+ # expose stats_mode N as an option to caller, required for zero-len inputs
+ "naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"),
+ "LN": lambda channels: LayerNorm(channels),
+ }[norm]
+ return norm(out_channels)
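+# Illustrative examples of the mapping above (not exhaustive):
+#   get_norm("BN", 64) -> BatchNorm2d(64)
+#   get_norm("GN", 64) -> nn.GroupNorm(32, 64)
+#   get_norm("", 64) -> None (i.e. no normalization)
+#   get_norm(nn.InstanceNorm2d, 64) -> nn.InstanceNorm2d(64)  # any callable works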
+
+
+class NaiveSyncBatchNorm(BatchNorm2d):
+ """
+ In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
+ when the batch size on each worker is different.
+ (e.g., when scale augmentation is used, or when it is applied to mask head).
+
+ This is a slower but correct alternative to `nn.SyncBatchNorm`.
+
+ Note:
+ There isn't a single definition of Sync BatchNorm.
+
+ When ``stats_mode==""``, this module computes overall statistics by using
+ statistics of each worker with equal weight. The result is true statistics
+ of all samples (as if they are all on one worker) only when all workers
+ have the same (N, H, W). This mode does not support inputs with zero batch size.
+
+ When ``stats_mode=="N"``, this module computes overall statistics by weighting
+ the statistics of each worker by their ``N``. The result is true statistics
+ of all samples (as if they are all on one worker) only when all workers
+ have the same (H, W). It is slower than ``stats_mode==""``.
+
+ Even though the result of this module may not be the true statistics of all samples,
+ it may still be reasonable because it might be preferable to assign equal weights
+ to all workers, regardless of their (H, W) dimension, instead of putting larger weight
+ on larger images. From preliminary experiments, little difference is found between such
+ a simplified implementation and an accurate computation of overall mean & variance.
+ """
+
+ def __init__(self, *args, stats_mode="", **kwargs):
+ super().__init__(*args, **kwargs)
+ assert stats_mode in ["", "N"]
+ self._stats_mode = stats_mode
+
+ def forward(self, input):
+ if get_world_size() == 1 or not self.training:
+ return super().forward(input)
+
+ B, C = input.shape[0], input.shape[1]
+
+ half_input = input.dtype == torch.float16
+ if half_input:
+ # fp16 does not have good enough numerics for the reduction here
+ input = input.float()
+ mean = torch.mean(input, dim=[0, 2, 3])
+ meansqr = torch.mean(input * input, dim=[0, 2, 3])
+
+ if self._stats_mode == "":
+ assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
+ vec = torch.cat([mean, meansqr], dim=0)
+ vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
+ mean, meansqr = torch.split(vec, C)
+ momentum = self.momentum
+ else:
+ if B == 0:
+ vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
+ vec = vec + input.sum() # make sure there is gradient w.r.t input
+ else:
+ vec = torch.cat(
+ [
+ mean,
+ meansqr,
+ torch.ones([1], device=mean.device, dtype=mean.dtype),
+ ],
+ dim=0,
+ )
+ vec = differentiable_all_reduce(vec * B)
+
+ total_batch = vec[-1].detach()
+ momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
+ mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C) # avoid div-by-zero
+
+ var = meansqr - mean * mean
+ invstd = torch.rsqrt(var + self.eps)
+ scale = self.weight * invstd
+ bias = self.bias - mean * scale
+ scale = scale.reshape(1, -1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1)
+
+ self.running_mean += momentum * (mean.detach() - self.running_mean)
+ self.running_var += momentum * (var.detach() - self.running_var)
+ ret = input * scale + bias
+ if half_input:
+ ret = ret.half()
+ return ret
+
+
+class CycleBatchNormList(nn.ModuleList):
+ """
+ Implement domain-specific BatchNorm by cycling.
+
+ When a BatchNorm layer is used for multiple input domains or input
+ features, it might need to maintain a separate test-time statistics
+ for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`.
+
+ This module implements it by using N separate BN layers
+ and it cycles through them every time a forward() is called.
+
+ NOTE: The caller of this module MUST guarantee to always call
+ this module a multiple of N times. Otherwise its test-time statistics
+ will be incorrect.
+ """
+
+ def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs):
+ """
+ Args:
+ length: number of BatchNorm layers to cycle.
+ bn_class: the BatchNorm class to use
+ kwargs: arguments of the BatchNorm class, such as num_features.
+ """
+ self._affine = kwargs.pop("affine", True)
+ super().__init__([bn_class(**kwargs, affine=False) for k in range(length)])
+ if self._affine:
+ # shared affine, domain-specific BN
+ channels = self[0].num_features
+ self.weight = nn.Parameter(torch.ones(channels))
+ self.bias = nn.Parameter(torch.zeros(channels))
+ self._pos = 0
+
+ def forward(self, x):
+ ret = self[self._pos](x)
+ self._pos = (self._pos + 1) % len(self)
+
+ if self._affine:
+ w = self.weight.reshape(1, -1, 1, 1)
+ b = self.bias.reshape(1, -1, 1, 1)
+ return ret * w + b
+ else:
+ return ret
+
+ def extra_repr(self):
+ return f"affine={self._affine}"
+
+
+class LayerNorm(nn.Module):
+ """
+ A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
+ variance normalization over the channel dimension for inputs that have shape
+ (batch_size, channels, height, width).
+ https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950
+ """
+
+ def __init__(self, normalized_shape, eps=1e-6):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
+ self.eps = eps
+ self.normalized_shape = (normalized_shape,)
+
+ def forward(self, x):
+ u = x.mean(1, keepdim=True)
+ s = (x - u).pow(2).mean(1, keepdim=True)
+ x = (x - u) / torch.sqrt(s + self.eps)
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
+ return x
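+
+
+# Illustrative sketch: unlike nn.LayerNorm, this variant normalizes over the
+# channel dimension of an NCHW tensor, one spatial position at a time.
+#
+#   ln = LayerNorm(64)
+#   y = ln(torch.randn(2, 64, 8, 8))  # same shape, normalized over 64 channels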
diff --git a/mmcv/layers/blocks.py b/mmcv/layers/blocks.py
new file mode 100644
index 0000000..1995a4b
--- /dev/null
+++ b/mmcv/layers/blocks.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import fvcore.nn.weight_init as weight_init
+from torch import nn
+
+from .batch_norm import FrozenBatchNorm2d, get_norm
+from .wrappers import Conv2d
+
+
+"""
+CNN building blocks.
+"""
+
+
+class CNNBlockBase(nn.Module):
+ """
+ A CNN block is assumed to have input channels, output channels and a stride.
+ The input and output of `forward()` method must be NCHW tensors.
+ The method can perform arbitrary computation but must match the given
+ channels and stride specification.
+
+ Attribute:
+ in_channels (int):
+ out_channels (int):
+ stride (int):
+ """
+
+ def __init__(self, in_channels, out_channels, stride):
+ """
+ The `__init__` method of any subclass should also contain these arguments.
+
+ Args:
+ in_channels (int):
+ out_channels (int):
+ stride (int):
+ """
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.stride = stride
+
+ def freeze(self):
+ """
+ Make this block not trainable.
+ This method sets all parameters to `requires_grad=False`,
+ and convert all BatchNorm layers to FrozenBatchNorm
+
+ Returns:
+ the block itself
+ """
+ for p in self.parameters():
+ p.requires_grad = False
+ FrozenBatchNorm2d.convert_frozen_batchnorm(self)
+ return self
+
+
+class DepthwiseSeparableConv2d(nn.Module):
+ """
+ A kxk depthwise convolution + a 1x1 convolution.
+
+ In :paper:`xception`, norm & activation are applied on the second conv.
+ :paper:`mobilenet` uses norm & activation on both convs.
+ """
+
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ padding=1,
+ dilation=1,
+ *,
+ norm1=None,
+ activation1=None,
+ norm2=None,
+ activation2=None,
+ ):
+ """
+ Args:
+ norm1, norm2 (str or callable): normalization for the two conv layers.
+ activation1, activation2 (callable(Tensor) -> Tensor): activation
+ function for the two conv layers.
+ """
+ super().__init__()
+ self.depthwise = Conv2d(
+ in_channels,
+ in_channels,
+ kernel_size=kernel_size,
+ padding=padding,
+ dilation=dilation,
+ groups=in_channels,
+ bias=not norm1,
+ norm=get_norm(norm1, in_channels),
+ activation=activation1,
+ )
+ self.pointwise = Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=1,
+ bias=not norm2,
+ norm=get_norm(norm2, out_channels),
+ activation=activation2,
+ )
+
+ # default initialization
+ weight_init.c2_msra_fill(self.depthwise)
+ weight_init.c2_msra_fill(self.pointwise)
+
+ def forward(self, x):
+ return self.pointwise(self.depthwise(x))
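+
+
+# A minimal usage sketch (illustrative; channel sizes are arbitrary):
+#
+#   conv = DepthwiseSeparableConv2d(
+#       64, 128, kernel_size=3, padding=1,
+#       norm1="BN", activation1=nn.ReLU(),
+#       norm2="BN", activation2=nn.ReLU(),
+#   )
+#   y = conv(torch.randn(2, 64, 32, 32))  # -> (2, 128, 32, 32)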
diff --git a/mmcv/layers/csrc/README.md b/mmcv/layers/csrc/README.md
new file mode 100644
index 0000000..778ed3d
--- /dev/null
+++ b/mmcv/layers/csrc/README.md
@@ -0,0 +1,7 @@
+
+
+To add a new Op:
+
+1. Create a new directory
+2. Implement new ops there
+3. Declare its Python interface in `vision.cpp`.
diff --git a/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
new file mode 100644
index 0000000..03f4211
--- /dev/null
+++ b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
@@ -0,0 +1,115 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
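+// Note: judging from the CPU/CUDA kernels, each row of `rois` holds 6 values:
+// (batch_index, center_x, center_y, width, height, angle). The angle is given
+// in degrees and converted to radians internally.
+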
+at::Tensor ROIAlignRotated_forward_cpu(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cpu(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio);
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+at::Tensor ROIAlignRotated_forward_cuda(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cuda(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio);
+#endif
+
+// Interface for Python
+inline at::Tensor ROIAlignRotated_forward(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const double spatial_scale,
+ const int64_t pooled_height,
+ const int64_t pooled_width,
+ const int64_t sampling_ratio) {
+ if (input.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+ return ROIAlignRotated_forward_cuda(
+ input,
+ rois,
+ spatial_scale,
+ pooled_height,
+ pooled_width,
+ sampling_ratio);
+#else
+ AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+ }
+ return ROIAlignRotated_forward_cpu(
+ input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+inline at::Tensor ROIAlignRotated_backward(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const double spatial_scale,
+ const int64_t pooled_height,
+ const int64_t pooled_width,
+ const int64_t batch_size,
+ const int64_t channels,
+ const int64_t height,
+ const int64_t width,
+ const int64_t sampling_ratio) {
+ if (grad.is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+ return ROIAlignRotated_backward_cuda(
+ grad,
+ rois,
+ spatial_scale,
+ pooled_height,
+ pooled_width,
+ batch_size,
+ channels,
+ height,
+ width,
+ sampling_ratio);
+#else
+ AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+ }
+ return ROIAlignRotated_backward_cpu(
+ grad,
+ rois,
+ spatial_scale,
+ pooled_height,
+ pooled_width,
+ batch_size,
+ channels,
+ height,
+ width,
+ sampling_ratio);
+}
+
+} // namespace detectron2
diff --git a/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
new file mode 100644
index 0000000..2a3d305
--- /dev/null
+++ b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
@@ -0,0 +1,522 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include "ROIAlignRotated.h"
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+template <typename T>
+struct PreCalc {
+ int pos1;
+ int pos2;
+ int pos3;
+ int pos4;
+ T w1;
+ T w2;
+ T w3;
+ T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int iy_upper,
+ const int ix_upper,
+ T roi_start_h,
+ T roi_start_w,
+ T bin_size_h,
+ T bin_size_w,
+ int roi_bin_grid_h,
+ int roi_bin_grid_w,
+ T roi_center_h,
+ T roi_center_w,
+ T cos_theta,
+ T sin_theta,
+ std::vector<PreCalc<T>>& pre_calc) {
+ int pre_calc_index = 0;
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ for (int iy = 0; iy < iy_upper; iy++) {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < ix_upper; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ // In image space, (y, x) is the order for Right Handed System,
+ // and this is essentially multiplying the point by a rotation matrix
+ // to rotate it counterclockwise through angle theta.
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+ // deal with: inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ PreCalc<T> pc;
+ pc.pos1 = 0;
+ pc.pos2 = 0;
+ pc.pos3 = 0;
+ pc.pos4 = 0;
+ pc.w1 = 0;
+ pc.w2 = 0;
+ pc.w3 = 0;
+ pc.w4 = 0;
+ pre_calc[pre_calc_index] = pc;
+ pre_calc_index += 1;
+ continue;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+ if (x < 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ // save weights and indices
+ PreCalc<T> pc;
+ pc.pos1 = y_low * width + x_low;
+ pc.pos2 = y_low * width + x_high;
+ pc.pos3 = y_high * width + x_low;
+ pc.pos4 = y_high * width + x_high;
+ pc.w1 = w1;
+ pc.w2 = w2;
+ pc.w3 = w3;
+ pc.w4 = w4;
+ pre_calc[pre_calc_index] = pc;
+
+ pre_calc_index += 1;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void bilinear_interpolate_gradient(
+ const int height,
+ const int width,
+ T y,
+ T x,
+ T& w1,
+ T& w2,
+ T& w3,
+ T& w4,
+ int& x_low,
+ int& x_high,
+ int& y_low,
+ int& y_high) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+
+ if (x < 0) {
+ x = 0;
+ }
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+
+template <typename T>
+inline void add(T* address, const T& val) {
+ *address += val;
+}
+
+} // namespace
+
+template <typename T>
+void ROIAlignRotatedForward(
+ const int nthreads,
+ const T* input,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* rois,
+ T* output) {
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
+ // (n, c, ph, pw) is an element in the pooled output
+ // can be parallelized using omp
+ // #pragma omp parallel for num_threads(32)
+ for (int n = 0; n < n_rois; n++) {
+ int index_n = n * channels * pooled_width * pooled_height;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ AT_ASSERTM(
+ roi_width >= 0 && roi_height >= 0,
+ "ROIs in ROIAlignRotated do not have non-negative size!");
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+ // we want to precalculate indices and weights shared by all channels,
+ // this is the key point of optimization
+ std::vector<PreCalc<T>> pre_calc(
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ pre_calc_for_bilinear_interpolate(
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_start_h,
+ roi_start_w,
+ bin_size_h,
+ bin_size_w,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_center_h,
+ roi_center_w,
+ cos_theta,
+ sin_theta,
+ pre_calc);
+
+ for (int c = 0; c < channels; c++) {
+ int index_n_c = index_n + c * pooled_width * pooled_height;
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+ int pre_calc_index = 0;
+
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ int index = index_n_c + ph * pooled_width + pw;
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ PreCalc<T> pc = pre_calc[pre_calc_index];
+ output_val += pc.w1 * offset_input[pc.pos1] +
+ pc.w2 * offset_input[pc.pos2] +
+ pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+ pre_calc_index += 1;
+ }
+ }
+ output_val /= count;
+
+ output[index] = output_val;
+ } // for pw
+ } // for ph
+ } // for c
+ } // for n
+}
+
+template <typename T>
+void ROIAlignRotatedBackward(
+ const int nthreads,
+ // may not be contiguous. should index using n_stride, etc
+ const T* grad_output,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ T* grad_input,
+ const T* rois,
+ const int n_stride,
+ const int c_stride,
+ const int h_stride,
+ const int w_stride) {
+ for (int index = 0; index < nthreads; index++) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ AT_ASSERTM(
+ roi_width >= 0 && roi_height >= 0,
+ "ROIs in ROIAlignRotated do not have non-negative size!");
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_grad_input =
+ grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+ int output_offset = n * n_stride + c * c_stride;
+ const T* offset_grad_output = grad_output + output_offset;
+ const T grad_output_this_bin =
+ offset_grad_output[ph * h_stride + pw * w_stride];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(
+ height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+ T g1 = grad_output_this_bin * w1 / count;
+ T g2 = grad_output_this_bin * w2 / count;
+ T g3 = grad_output_this_bin * w3 / count;
+ T g4 = grad_output_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ // atomic add is not needed for now since it is single threaded
+ add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+ add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+ add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+ add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // for
+} // ROIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cpu(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+ AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+ at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+ at::CheckedFrom c = "ROIAlign_forward_cpu";
+ at::checkAllSameType(c, {input_t, rois_t});
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ at::Tensor output = at::zeros(
+ {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+ if (output.numel() == 0) {
+ return output;
+ }
+
+ auto input_ = input.contiguous(), rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ input.scalar_type(), "ROIAlignRotated_forward", [&] {
+ ROIAlignRotatedForward<scalar_t>(
+ output_size,
+ input_.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois_.data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ return output;
+}
+
+at::Tensor ROIAlignRotated_backward_cpu(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+ AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+ at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+ at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
+ at::checkAllSameType(c, {grad_t, rois_t});
+
+ at::Tensor grad_input =
+ at::zeros({batch_size, channels, height, width}, grad.options());
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ return grad_input;
+ }
+
+ // get stride values to ensure indexing into gradients is correct.
+ int n_stride = grad.stride(0);
+ int c_stride = grad.stride(1);
+ int h_stride = grad.stride(2);
+ int w_stride = grad.stride(3);
+
+ auto rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ grad.scalar_type(), "ROIAlignRotated_forward", [&] {
+ ROIAlignRotatedBackward<scalar_t>(
+ grad.numel(),
+ grad.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ grad_input.data_ptr<scalar_t>(),
+ rois_.data_ptr<scalar_t>(),
+ n_stride,
+ c_stride,
+ h_stride,
+ w_stride);
+ });
+ return grad_input;
+}
+
+} // namespace detectron2
diff --git a/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
new file mode 100644
index 0000000..fca1865
--- /dev/null
+++ b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
@@ -0,0 +1,443 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+__device__ T bilinear_interpolate(
+ const T* input,
+ const int height,
+ const int width,
+ T y,
+ T x) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ return 0;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+
+ if (x < 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = input[y_low * width + x_low];
+ T v2 = input[y_low * width + x_high];
+ T v3 = input[y_high * width + x_low];
+ T v4 = input[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+ const int height,
+ const int width,
+ T y,
+ T x,
+ T& w1,
+ T& w2,
+ T& w3,
+ T& w4,
+ int& x_low,
+ int& x_high,
+ int& y_low,
+ int& y_high) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y < 0) {
+ y = 0;
+ }
+
+ if (x < 0) {
+ x = 0;
+ }
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+
+} // namespace
+
+template <typename T>
+__global__ void RoIAlignRotatedForward(
+ const int nthreads,
+ const T* input,
+ const T spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* rois,
+ T* top_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_input =
+ input + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ // We do average (integral) pooling inside a bin
+ const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+ {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+ T val = bilinear_interpolate(offset_input, height, width, y, x);
+ output_val += val;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+}
+
+template <typename T>
+__global__ void RoIAlignRotatedBackwardFeature(
+ const int nthreads,
+ const T* top_diff,
+ const int num_rois,
+ const T spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ T* bottom_diff,
+ const T* rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* current_roi = rois + n * 6;
+ int roi_batch_ind = current_roi[0];
+
+ // Do not use rounding; this implementation detail is critical
+ // ROIAlignRotated supports align == true, i.e., continuous coordinate
+ // by default, thus the 0.5 offset
+ T offset = (T)0.5;
+ T roi_center_w = current_roi[1] * spatial_scale - offset;
+ T roi_center_h = current_roi[2] * spatial_scale - offset;
+ T roi_width = current_roi[3] * spatial_scale;
+ T roi_height = current_roi[4] * spatial_scale;
+ T theta = current_roi[5] * M_PI / 180.0;
+ T cos_theta = cos(theta);
+ T sin_theta = sin(theta);
+
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_bottom_diff =
+ bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+ // Appropriate translation needs to be applied after.
+ T roi_start_h = -roi_height / 2.0;
+ T roi_start_w = -roi_width / 2.0;
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+ {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast<T>(iy + .5f) * bin_size_h /
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ // Rotate by theta around the center and translate
+ T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+ T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(
+ height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+ T g1 = top_diff_this_bin * w1 / count;
+ T g2 = top_diff_this_bin * w2 / count;
+ T g3 = top_diff_this_bin * w3 / count;
+ T g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+ atomicAdd(
+ offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+ atomicAdd(
+ offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+ atomicAdd(
+ offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+ atomicAdd(
+ offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignRotatedBackward
+
+at::Tensor ROIAlignRotated_forward_cuda(
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+ at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+ at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
+ at::checkAllSameGPU(c, {input_t, rois_t});
+ at::checkAllSameType(c, {input_t, rois_t});
+ at::cuda::CUDAGuard device_guard(input.device());
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty(
+ {num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(
+ at::cuda::ATenCeilDiv(
+ static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+ static_cast<int64_t>(4096)));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ AT_CUDA_CHECK(cudaGetLastError());
+ return output;
+ }
+
+ auto input_ = input.contiguous(), rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES(
+ input.scalar_type(), "ROIAlignRotated_forward", [&] {
+ RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+ input_.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois_.data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ cudaDeviceSynchronize();
+ AT_CUDA_CHECK(cudaGetLastError());
+ return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlignRotated_backward_cuda(
+ const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+ at::CheckedFrom c = "ROIAlign_backward_cuda";
+ at::checkAllSameGPU(c, {grad_t, rois_t});
+ at::checkAllSameType(c, {grad_t, rois_t});
+ at::cuda::CUDAGuard device_guard(grad.device());
+
+ auto num_rois = rois.size(0);
+ auto grad_input =
+ at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(
+ at::cuda::ATenCeilDiv(
+ static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+ static_cast<int64_t>(4096)));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ AT_CUDA_CHECK(cudaGetLastError());
+ return grad_input;
+ }
+
+ auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
+ AT_DISPATCH_FLOATING_TYPES(
+ grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+ RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+ grad_.data_ptr<scalar_t>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ grad_input.data_ptr<scalar_t>(),
+ rois_.data_ptr<scalar_t>());
+ });
+ AT_CUDA_CHECK(cudaGetLastError());
+ return grad_input;
+}
+
+} // namespace detectron2
diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated.h b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated.h
new file mode 100644
index 0000000..3bf383b
--- /dev/null
+++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated.h
@@ -0,0 +1,35 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor box_iou_rotated_cpu(
+ const at::Tensor& boxes1,
+ const at::Tensor& boxes2);
+
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+at::Tensor box_iou_rotated_cuda(
+ const at::Tensor& boxes1,
+ const at::Tensor& boxes2);
+#endif
+
+// Interface for Python
+// inline is needed to prevent multiple function definitions when this header is
+// included by different cpps
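+// Boxes are (N, 5) float tensors in (x_ctr, y_ctr, width, height,
+// angle_in_degrees) format; the result is an N1 x N2 matrix of pairwise IoUs.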
+inline at::Tensor box_iou_rotated(
+ const at::Tensor& boxes1,
+ const at::Tensor& boxes2) {
+ assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
+ if (boxes1.device().is_cuda()) {
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+ return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
+#else
+ AT_ERROR("Detectron2 is not compiled with GPU support!");
+#endif
+ }
+
+ return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
+}
+
+} // namespace detectron2
diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
new file mode 100644
index 0000000..c843487
--- /dev/null
+++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
@@ -0,0 +1,39 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include "box_iou_rotated.h"
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+template <typename T>
+void box_iou_rotated_cpu_kernel(
+ const at::Tensor& boxes1,
+ const at::Tensor& boxes2,
+ at::Tensor& ious) {
+ auto num_boxes1 = boxes1.size(0);
+ auto num_boxes2 = boxes2.size(0);
+
+ for (int i = 0; i < num_boxes1; i++) {
+ for (int j = 0; j < num_boxes2; j++) {
+ ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
+ boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
+ }
+ }
+}
+
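+// Dense O(N1 * N2) pairwise IoU on CPU; inputs are expected to be float
+// tensors, and the result is returned as a float tensor.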
+at::Tensor box_iou_rotated_cpu(
+ // input must be contiguous:
+ const at::Tensor& boxes1,
+ const at::Tensor& boxes2) {
+ auto num_boxes1 = boxes1.size(0);
+ auto num_boxes2 = boxes2.size(0);
+ at::Tensor ious =
+ at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+ box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);
+
+ // reshape from 1d array to 2d array
+ auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+ return ious.reshape(shape);
+}
+
+} // namespace detectron2
diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
new file mode 100644
index 0000000..952710e
--- /dev/null
+++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
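+// Each 32x16 thread block loads up to 32 boxes from set 1 and 16 boxes from
+// set 2 into shared memory, then every thread computes one entry of the
+// corresponding 32x16 tile of the pairwise IoU matrix.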
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+ const int n_boxes1,
+ const int n_boxes2,
+ const T* dev_boxes1,
+ const T* dev_boxes2,
+ T* dev_ious) {
+ const int row_start = blockIdx.x * blockDim.x;
+ const int col_start = blockIdx.y * blockDim.y;
+
+ const int row_size = min(n_boxes1 - row_start, blockDim.x);
+ const int col_size = min(n_boxes2 - col_start, blockDim.y);
+
+ __shared__ float block_boxes1[BLOCK_DIM_X * 5];
+ __shared__ float block_boxes2[BLOCK_DIM_Y * 5];
+
+ // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
+ if (threadIdx.x < row_size && threadIdx.y == 0) {
+ block_boxes1[threadIdx.x * 5 + 0] =
+ dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
+ block_boxes1[threadIdx.x * 5 + 1] =
+ dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
+ block_boxes1[threadIdx.x * 5 + 2] =
+ dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
+ block_boxes1[threadIdx.x * 5 + 3] =
+ dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
+ block_boxes1[threadIdx.x * 5 + 4] =
+ dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
+ }
+
+ if (threadIdx.x < col_size && threadIdx.y == 0) {
+ block_boxes2[threadIdx.x * 5 + 0] =
+ dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
+ block_boxes2[threadIdx.x * 5 + 1] =
+ dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
+ block_boxes2[threadIdx.x * 5 + 2] =
+ dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
+ block_boxes2[threadIdx.x * 5 + 3] =
+ dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
+ block_boxes2[threadIdx.x * 5 + 4] =
+ dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size && threadIdx.y < col_size) {
+ int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
+ dev_ious[offset] = single_box_iou_rotated<T>(
+ block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
+ }
+}
+
+at::Tensor box_iou_rotated_cuda(
+ // input must be contiguous
+ const at::Tensor& boxes1,
+ const at::Tensor& boxes2) {
+ using scalar_t = float;
+ AT_ASSERTM(
+ boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
+ AT_ASSERTM(
+ boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
+ AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
+ AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
+ at::cuda::CUDAGuard device_guard(boxes1.device());
+
+ auto num_boxes1 = boxes1.size(0);
+ auto num_boxes2 = boxes2.size(0);
+
+ at::Tensor ious =
+ at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+ bool transpose = false;
+ if (num_boxes1 > 0 && num_boxes2 > 0) {
+ scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
+ *data2 = boxes2.data_ptr<scalar_t>();
+
+ if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
+ AT_ASSERTM(
+ num_boxes1 <= 65535 * BLOCK_DIM_Y,
+ "Too many boxes for box_iou_rotated_cuda!");
+ // x dim is allowed to be large, but y dim cannot,
+ // so we transpose the two to avoid "invalid configuration argument"
+ // error. We assume one of them is small. Otherwise the result is hard to
+ // fit in memory anyway.
+ std::swap(num_boxes1, num_boxes2);
+ std::swap(data1, data2);
+ transpose = true;
+ }
+
+ const int blocks_x =
+ at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
+ const int blocks_y =
+ at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);
+
+ dim3 blocks(blocks_x, blocks_y);
+ dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+ num_boxes1,
+ num_boxes2,
+ data1,
+ data2,
+ (scalar_t*)ious.data_ptr<scalar_t>());
+
+ AT_CUDA_CHECK(cudaGetLastError());
+ }
+
+ // reshape from 1d array to 2d array
+ auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+ if (transpose) {
+ return ious.view(shape).t();
+ } else {
+ return ious.view(shape);
+ }
+}
+
+} // namespace detectron2
diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
new file mode 100644
index 0000000..bc6967a
--- /dev/null
+++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
@@ -0,0 +1,391 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+#pragma once
+
+#include <cassert>
+#include <cmath>
+
+#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+struct RotatedBox {
+ T x_ctr, y_ctr, w, h, a;
+};
+
+template <typename T>
+struct Point {
+ T x, y;
+ HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+ HOST_DEVICE_INLINE Point operator+(const Point& p) const {
+ return Point(x + p.x, y + p.y);
+ }
+ HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
+ x += p.x;
+ y += p.y;
+ return *this;
+ }
+ HOST_DEVICE_INLINE Point operator-(const Point& p) const {
+ return Point(x - p.x, y - p.y);
+ }
+ HOST_DEVICE_INLINE Point operator*(const T coeff) const {
+ return Point(x * coeff, y * coeff);
+ }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.x + A.y * B.y;
+}
+
+// R: result type. can be different from input type
+template <typename T, typename R = T>
+HOST_DEVICE_INLINE R cross_2d(const Point<T>& A, const Point<T>& B) {
+ return static_cast<R>(A.x) * static_cast<R>(B.y) -
+ static_cast<R>(B.x) * static_cast<R>(A.y);
+}
+
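+// Computes the four corners of a rotated box from its center, size, and
+// angle (given in degrees, converted to radians here) in a fixed winding
+// order; the opposite corners are obtained by reflecting through the center.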
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(
+ const RotatedBox<T>& box,
+ Point<T> (&pts)[4]) {
+ // M_PI / 180. == 0.01745329251
+ double theta = box.a * 0.01745329251;
+ T cosTheta2 = (T)cos(theta) * 0.5f;
+ T sinTheta2 = (T)sin(theta) * 0.5f;
+
+ // y: top --> down; x: left --> right
+ pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w;
+ pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w;
+ pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[2].x = 2 * box.x_ctr - pts[0].x;
+ pts[2].y = 2 * box.y_ctr - pts[0].y;
+ pts[3].x = 2 * box.x_ctr - pts[1].x;
+ pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(
+ const Point<T> (&pts1)[4],
+ const Point<T> (&pts2)[4],
+ Point<T> (&intersections)[24]) {
+ // Line vector
+ // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+ Point<T> vec1[4], vec2[4];
+ for (int i = 0; i < 4; i++) {
+ vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+ vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+ }
+
+ // When computing the intersection area, it doesn't hurt if we have
+ // more (duplicated/approximate) intersections/vertices than needed,
+ // while it can cause drastic difference if we miss an intersection/vertex.
+ // Therefore, we add an epsilon to relax the comparisons between
+ // the float point numbers that decide the intersection points.
+ double EPS = 1e-5;
+
+ // Line test - test all line combos for intersection
+ int num = 0; // number of intersections
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ // Solve for 2x2 Ax=b
+ T det = cross_2d<T>(vec2[j], vec1[i]);
+
+ // This takes care of parallel lines
+ if (fabs(det) <= 1e-14) {
+ continue;
+ }
+
+ auto vec12 = pts2[j] - pts1[i];
+
+ T t1 = cross_2d<T>(vec2[j], vec12) / det;
+ T t2 = cross_2d<T>(vec1[i], vec12) / det;
+
+ if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) {
+ intersections[num++] = pts1[i] + vec1[i] * t1;
+ }
+ }
+ }
+
+ // Check for vertices of rect1 inside rect2
+ {
+ const auto& AB = vec2[0];
+ const auto& DA = vec2[3];
+ auto ABdotAB = dot_2d<T>(AB, AB);
+ auto ADdotAD = dot_2d<T>(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ // assume ABCD is the rectangle, and P is the point to be judged
+ // P is inside ABCD iff. P's projection on AB lies within AB
+ // and P's projection on AD lies within AD
+
+ auto AP = pts1[i] - pts2[0];
+
+ auto APdotAB = dot_2d<T>(AP, AB);
+ auto APdotAD = -dot_2d<T>(AP, DA);
+
+ if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
+ (APdotAD < ADdotAD + EPS)) {
+ intersections[num++] = pts1[i];
+ }
+ }
+ }
+
+ // Reverse the check - check for vertices of rect2 inside rect1
+ {
+ const auto& AB = vec1[0];
+ const auto& DA = vec1[3];
+ auto ABdotAB = dot_2d<T>(AB, AB);
+ auto ADdotAD = dot_2d<T>(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ auto AP = pts2[i] - pts1[0];
+
+ auto APdotAB = dot_2d<T>(AP, AB);
+ auto APdotAD = -dot_2d<T>(AP, DA);
+
+ if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) &&
+ (APdotAD < ADdotAD + EPS)) {
+ intersections[num++] = pts2[i];
+ }
+ }
+ }
+
+ return num;
+}
+
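+// Graham scan over the (up to 24) candidate points: picks the lowest point
+// (ties broken by x) as the pivot, sorts the remaining points by polar angle
+// around it (ties broken by distance), and writes the ordered hull vertices
+// into q; shift_to_zero controls whether the result stays translated so that
+// the pivot sits at the origin.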
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(
+ const Point<T> (&p)[24],
+ const int& num_in,
+ Point<T> (&q)[24],
+ bool shift_to_zero = false) {
+ assert(num_in >= 2);
+
+ // Step 1:
+ // Find point with minimum y
+ // if more than 1 points have the same minimum y,
+ // pick the one with the minimum x.
+ int t = 0;
+ for (int i = 1; i < num_in; i++) {
+ if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+ t = i;
+ }
+ }
+ auto& start = p[t]; // starting point
+
+ // Step 2:
+ // Subtract starting point from every points (for sorting in the next step)
+ for (int i = 0; i < num_in; i++) {
+ q[i] = p[i] - start;
+ }
+
+ // Swap the starting point to position 0
+ auto tmp = q[0];
+ q[0] = q[t];
+ q[t] = tmp;
+
+ // Step 3:
+ // Sort point 1 ~ num_in according to their relative cross-product values
+ // (essentially sorting according to angles)
+ // If the angles are the same, sort according to their distance to origin
+ T dist[24];
+#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1
+ // compute distance to origin before sort, and sort them together with the
+ // points
+ for (int i = 0; i < num_in; i++) {
+ dist[i] = dot_2d<T>(q[i], q[i]);
+ }
+
+ // CUDA version
+ // In the future, we can potentially use thrust
+ // for sorting here to improve speed (though not guaranteed)
+ for (int i = 1; i < num_in - 1; i++) {
+ for (int j = i + 1; j < num_in; j++) {
+ T crossProduct = cross_2d<T>(q[i], q[j]);
+ if ((crossProduct < -1e-6) ||
+ (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+ auto q_tmp = q[i];
+ q[i] = q[j];
+ q[j] = q_tmp;
+ auto dist_tmp = dist[i];
+ dist[i] = dist[j];
+ dist[j] = dist_tmp;
+ }
+ }
+ }
+#else
+ // CPU version
+ // std::sort(
+ // q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
+ // T temp = cross_2d<T>(A, B);
+
+ // if (fabs(temp) < 1e-6) {
+ // return dot_2d<T>(A, A) < dot_2d<T>(B, B);
+ // } else {
+ // return temp > 0;
+ // }
+ // });
+ for (int i = 0; i < num_in; i++) {
+ dist[i] = dot_2d<T>(q[i], q[i]);
+ }
+
+ for (int i = 1; i < num_in - 1; i++) {
+ for (int j = i + 1; j < num_in; j++) {
+ T crossProduct = cross_2d